%run tweets_classes.ipynb
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
This project compares a collaborative filtering-based recommender with a hybrid recommender system. The collaborative filtering-based recommender is built with the Matrix Factorization technique, while the hybrid recommender uses the Embedding layer from the PyTorch package.
# Column names for the MovieLens u.item file: movie metadata plus one
# 0/1 indicator column per genre.
m_cols = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'sci_fi', 'Thriller', 'War', 'Western',
]
# u.item is pipe-separated and latin-1 encoded (it predates UTF-8 defaults).
movies_info = pd.read_csv('u.item', sep='|', names=m_cols, encoding='latin-1')
movies_info.head()
| movie_id | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | Adventure | Animation | Children | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2 | GoldenEye (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?GoldenEye%20(... | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 3 | Four Rooms (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Four%20Rooms%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 4 | Get Shorty (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Get%20Shorty%... | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | Copycat (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Copycat%20(1995) | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 24 columns
movies_info.shape
(1682, 24)
# Load the 100k ratings from u.data (tab-separated, no header row).
m_cols = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=m_cols)
ratings.head()
| user_id | item_id | rating | timestamp | |
|---|---|---|---|---|
| 0 | 196 | 242 | 3 | 881250949 |
| 1 | 186 | 302 | 3 | 891717742 |
| 2 | 22 | 377 | 1 | 878887116 |
| 3 | 244 | 51 | 2 | 880606923 |
| 4 | 166 | 346 | 1 | 886397596 |
ratings.shape
(100000, 4)
# Load the genre lookup table from u.genre (pipe-separated, no header row).
m_cols = ['genre', 'id']
genres = pd.read_csv('u.genre', sep='|', names=m_cols)
genres.head()
| genre | id | |
|---|---|---|
| 0 | unknown | 0 |
| 1 | Action | 1 |
| 2 | Adventure | 2 |
| 3 | Animation | 3 |
| 4 | Children's | 4 |
genres.shape
(19, 2)
Given the u.item dataset, extract a tag word from each movie name. Then search Twitter using that tag word to download related tweets for the movie, and repeat this for every movie. Clean the tweets, then calculate a sentiment score for each tweet of each movie. Finally, average the sentiment scores per movie and save that average as the movie's sentiment score.
def get_tag_word(s):
    """Build a Twitter search keyword from a movie title.

    Drops every parenthesized segment (typically the release year) and then
    removes all whitespace, e.g. ``"Toy Story (1995)"`` -> ``"ToyStory"``.

    Parameters
    ----------
    s : str
        Raw movie title from the u.item dataset.

    Returns
    -------
    str
        The title with parentheticals and whitespace stripped out.
    """
    # Remove parenthesized segments such as "(1995)"; the group in the
    # original pattern was redundant, so it is dropped here.
    t = re.sub(r"\(.*?\)", '', s)
    # Collapse ALL whitespace (spaces, tabs, newlines), not just plain
    # spaces, so the keyword is always a single unbroken token.
    return re.sub(r"\s+", '', t)
# Derive the search keyword for every movie title. The original wrapped
# get_tag_word in a redundant lambda; apply can take the function directly.
movies_info['keyword'] = movies_info['movie_title'].apply(get_tag_word)
movies_info.head()
| movie_id | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | Adventure | Animation | Children | ... | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | keyword | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ToyStory |
| 1 | 2 | GoldenEye (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?GoldenEye%20(... | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | GoldenEye |
| 2 | 3 | Four Rooms (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Four%20Rooms%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | FourRooms |
| 3 | 4 | Get Shorty (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Get%20Shorty%... | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GetShorty |
| 4 | 5 | Copycat (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Copycat%20(1995) | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Copycat |
5 rows × 25 columns
To extract the information, we used the snscrape package. We built a class called "Scraper" that contains the necessary functions to do the scraping. All the extracted information is stored in CSV files, so it can later be loaded into a pandas DataFrame.
# Anchor date for the tweet-extraction window.
date = datetime.datetime(2023, 4, 7)
# Build the list of the 60 days preceding (and excluding) 04/07/2023.
dates = [date + datetime.timedelta(days=offset) for offset in range(-60, 0)]
# Fixed query fragment appended to every search (empty for now).
fixed_query = ''
# Collect the per-movie search keywords into a plain Python list.
keywords = movies_info['keyword'].tolist()
dates
[datetime.datetime(2023, 2, 6, 0, 0), datetime.datetime(2023, 2, 7, 0, 0), datetime.datetime(2023, 2, 8, 0, 0), datetime.datetime(2023, 2, 9, 0, 0), datetime.datetime(2023, 2, 10, 0, 0), datetime.datetime(2023, 2, 11, 0, 0), datetime.datetime(2023, 2, 12, 0, 0), datetime.datetime(2023, 2, 13, 0, 0), datetime.datetime(2023, 2, 14, 0, 0), datetime.datetime(2023, 2, 15, 0, 0), datetime.datetime(2023, 2, 16, 0, 0), datetime.datetime(2023, 2, 17, 0, 0), datetime.datetime(2023, 2, 18, 0, 0), datetime.datetime(2023, 2, 19, 0, 0), datetime.datetime(2023, 2, 20, 0, 0), datetime.datetime(2023, 2, 21, 0, 0), datetime.datetime(2023, 2, 22, 0, 0), datetime.datetime(2023, 2, 23, 0, 0), datetime.datetime(2023, 2, 24, 0, 0), datetime.datetime(2023, 2, 25, 0, 0), datetime.datetime(2023, 2, 26, 0, 0), datetime.datetime(2023, 2, 27, 0, 0), datetime.datetime(2023, 2, 28, 0, 0), datetime.datetime(2023, 3, 1, 0, 0), datetime.datetime(2023, 3, 2, 0, 0), datetime.datetime(2023, 3, 3, 0, 0), datetime.datetime(2023, 3, 4, 0, 0), datetime.datetime(2023, 3, 5, 0, 0), datetime.datetime(2023, 3, 6, 0, 0), datetime.datetime(2023, 3, 7, 0, 0), datetime.datetime(2023, 3, 8, 0, 0), datetime.datetime(2023, 3, 9, 0, 0), datetime.datetime(2023, 3, 10, 0, 0), datetime.datetime(2023, 3, 11, 0, 0), datetime.datetime(2023, 3, 12, 0, 0), datetime.datetime(2023, 3, 13, 0, 0), datetime.datetime(2023, 3, 14, 0, 0), datetime.datetime(2023, 3, 15, 0, 0), datetime.datetime(2023, 3, 16, 0, 0), datetime.datetime(2023, 3, 17, 0, 0), datetime.datetime(2023, 3, 18, 0, 0), datetime.datetime(2023, 3, 19, 0, 0), datetime.datetime(2023, 3, 20, 0, 0), datetime.datetime(2023, 3, 21, 0, 0), datetime.datetime(2023, 3, 22, 0, 0), datetime.datetime(2023, 3, 23, 0, 0), datetime.datetime(2023, 3, 24, 0, 0), datetime.datetime(2023, 3, 25, 0, 0), datetime.datetime(2023, 3, 26, 0, 0), datetime.datetime(2023, 3, 27, 0, 0), datetime.datetime(2023, 3, 28, 0, 0), datetime.datetime(2023, 3, 29, 0, 0), datetime.datetime(2023, 3, 30, 0, 
0), datetime.datetime(2023, 3, 31, 0, 0), datetime.datetime(2023, 4, 1, 0, 0), datetime.datetime(2023, 4, 2, 0, 0), datetime.datetime(2023, 4, 3, 0, 0), datetime.datetime(2023, 4, 4, 0, 0), datetime.datetime(2023, 4, 5, 0, 0), datetime.datetime(2023, 4, 6, 0, 0)]
fixed_query
''
keywords
['ToyStory', 'GoldenEye', 'FourRooms', 'GetShorty', 'Copycat', 'ShanghaiTriad', 'TwelveMonkeys', 'Babe', 'DeadManWalking', 'RichardIII', 'Seven', 'UsualSuspects,The', 'MightyAphrodite', 'Postino,Il', "Mr.Holland'sOpus", 'FrenchTwist', 'FromDuskTillDawn', 'WhiteBalloon,The', "Antonia'sLine", 'AngelsandInsects', 'MuppetTreasureIsland', 'Braveheart', 'TaxiDriver', 'RumbleintheBronx', 'Birdcage,The', 'BrothersMcMullen,The', 'BadBoys', 'Apollo13', 'BatmanForever', 'Belledejour', 'CrimsonTide', 'Crumb', 'Desperado', 'DoomGeneration,The', 'FreeWilly2:TheAdventureHome', 'MadLove', 'Nadja', 'Net,The', 'StrangeDays', 'ToWongFoo,ThanksforEverything!JulieNewmar', 'BillyMadison', 'Clerks', 'Disclosure', 'DoloresClaiborne', 'EatDrinkManWoman', 'Exotica', 'EdWood', 'HoopDreams', 'I.Q.', 'StarWars', 'LegendsoftheFall', 'MadnessofKingGeorge,The', 'NaturalBornKillers', 'Outbreak', 'Professional,The', 'PulpFiction', 'Priest', 'QuizShow', 'ThreeColors:Red', 'ThreeColors:Blue', 'ThreeColors:White', 'Stargate', 'SantaClause,The', 'ShawshankRedemption,The', "What'sEatingGilbertGrape", 'WhileYouWereSleeping', 'AceVentura:PetDetective', 'Crow,The', 'ForrestGump', 'FourWeddingsandaFuneral', 'LionKing,The', 'Mask,The', 'Maverick', 'FasterPussycat!Kill!Kill!', 'BrotherMinister:TheAssassinationofMalcolmX', "Carlito'sWay", 'Firm,The', 'FreeWilly', 'Fugitive,The', 'HotShots!PartDeux', 'HudsuckerProxy,The', 'JurassicPark', 'MuchAdoAboutNothing', "RobertA.Heinlein'sThePuppetMasters", 'Ref,The', 'RemainsoftheDay,The', 'SearchingforBobbyFischer', 'SleeplessinSeattle', 'BladeRunner', 'SoIMarriedanAxeMurderer', 'NightmareBeforeChristmas,The', 'TrueRomance', 'WelcometotheDollhouse', 'HomeAlone', 'Aladdin', 'Terminator2:JudgmentDay', 'DanceswithWolves', 'SilenceoftheLambs,The', 'SnowWhiteandtheSevenDwarfs', 'Fargo', 'HeavyMetal', 'Aristocats,The', 'AllDogsGotoHeaven2', 'TheodoreRex', 'Sgt.Bilko', 'Diabolique', 'MollFlanders', 'KidsintheHall:BrainCandy', 'MysteryScienceTheater3000:TheMovie', 
'OperationDumboDrop', 'TruthAboutCats&Dogs,The', 'Flipper', 'HorsemanontheRoof,The', 'Wallace&Gromit:TheBestofAardmanAnimation', 'HauntedWorldofEdwardD.WoodJr.,The', 'ColdComfortFarm', 'Rock,The', 'Twister', 'MayaLin:AStrongClearVision', 'Striptease', 'IndependenceDay', 'CableGuy,The', 'Frighteners,The', 'LoneStar', 'Phenomenon', 'SpitfireGrill,The', 'Godfather,The', 'Supercop', 'Bound', 'KansasCity', "BreakfastatTiffany's", 'WizardofOz,The', 'GonewiththeWind', 'CitizenKane', '2001:ASpaceOdyssey', 'Mr.SmithGoestoWashington', 'BigNight', 'D3:TheMightyDucks', 'LoveBug,The', 'HomewardBound:TheIncredibleJourney', '20,000LeaguesUndertheSea', 'BedknobsandBroomsticks', 'SoundofMusic,The', 'DieHard', 'LawnmowerMan,The', 'UnhooktheStars', 'LongKissGoodnight,The', 'GhostandtheDarkness,The', 'Jude', 'Swingers', 'WillyWonkaandtheChocolateFactory', 'Sleeper', 'FishCalledWanda,A', "MontyPython'sLifeofBrian", 'DirtyDancing', 'ReservoirDogs', 'Platoon', "WeekendatBernie's", 'BasicInstinct', 'GlengarryGlenRoss', 'TopGun', 'OnGoldenPond', 'ReturnofthePinkPanther,The', 'Abyss,The', 'JeandeFlorette', 'ManonoftheSpring', 'PrivateBenjamin', 'MontyPythonandtheHolyGrail', 'WrongTrousers,The', 'CinemaParadiso', 'Delicatessen', 'EmpireStrikesBack,The', 'PrincessBride,The', 'RaidersoftheLostArk', 'Brazil', 'Aliens', 'Good,TheBadandTheUgly,The', '12AngryMen', 'ClockworkOrange,A', 'ApocalypseNow', 'ReturnoftheJedi', 'GoodFellas', 'Alien', 'ArmyofDarkness', 'Psycho', 'BluesBrothers,The', 'Godfather:PartII,The', 'FullMetalJacket', 'GrandDayOut,A', 'HenryV', 'Amadeus', 'RagingBull', 'RightStuff,The', 'Sting,The', 'Terminator,The', 'DeadPoetsSociety', 'Graduate,The', 'Nikita', 'BridgeontheRiverKwai,The', 'Shining,The', 'EvilDeadII', 'GroundhogDay', 'Unforgiven', 'BacktotheFuture', 'Patton', 'Akira', 'CyranodeBergerac', 'YoungFrankenstein', 'ThisIsSpinalTap', 'IndianaJonesandtheLastCrusade', 'M*A*S*H', 'UnbearableLightnessofBeing,The', 'RoomwithaView,A', 'PinkFloyd-TheWall', 'FieldofDreams', 
'WhenHarryMetSally...', "BramStoker'sDracula", 'CapeFear', 'NightmareonElmStreet,A', 'MirrorHasTwoFaces,The', 'BreakingtheWaves', 'StarTrek:FirstContact', 'SlingBlade', 'Ridicule', '101Dalmatians', 'DieHard2', 'StarTrekVI:TheUndiscoveredCountry', 'StarTrek:TheWrathofKhan', 'StarTrekIII:TheSearchforSpock', 'StarTrekIV:TheVoyageHome', 'BatmanReturns', 'YoungGuns', 'UnderSiege', 'Jaws', 'MarsAttacks!', 'CitizenRuth', 'JerryMaguire', 'RaisingArizona', 'Sneakers', 'BeavisandButt-headDoAmerica', 'LastoftheMohicans,The', 'Kolya', 'Jungle2Jungle', "Smilla'sSenseofSnow", "Devil'sOwn,The", 'ChasingAmy', 'Turbo:APowerRangersMovie', 'GrossePointeBlank', 'AustinPowers:InternationalManofMystery', 'FifthElement,The', 'ShallWeDance?', 'LostWorld:JurassicPark,The', 'PillowBook,The', 'Batman&Robin', "MyBestFriend'sWedding", 'WhentheCatsAway', 'MeninBlack', 'Contact', 'GeorgeoftheJungle', 'EventHorizon', 'AirBud', 'IntheCompanyofMen', 'Steel', 'Mimic', 'HuntforRedOctober,The', 'KulltheConqueror', 'unknown', 'ChasingAmy', 'FullMonty,The', 'Gattaca', 'StarshipTroopers', 'GoodWillHunting', 'Heat', 'Sabrina', 'SenseandSensibility', 'LeavingLasVegas', 'Restoration', 'BedofRoses', 'OnceUponaTime...WhenWeWereColored', 'UpCloseandPersonal', 'RiverWild,The', 'TimetoKill,A', 'Emma', 'TinCup', 'Secrets&Lies', 'EnglishPatient,The', "Marvin'sRoom", 'Scream', 'Evita', 'FierceCreatures', 'AbsolutePower', 'Rosewood', 'DonnieBrasco', 'LiarLiar', 'Breakdown', 'Promesse,La', "Ulee'sGold", 'Face/Off', 'Hoodlum', 'AirForceOne', 'In&Out', 'L.A.Confidential', "Ulee'sGold", 'FlyAwayHome', 'IceStorm,The', 'Mrs.Brown', "Devil'sAdvocate,The", 'FairyTale:ATrueStory', 'Deceiver', 'Rainmaker,The', 'WingsoftheDove,The', 'MidnightintheGardenofGoodandEvil', 'Titanic', '3Ninjas:HighNoonAtMegaMountain', 'AptPupil', 'AsGoodAsItGets', 'IntheNameoftheFather', "Schindler'sList", 'EveryoneSaysILoveYou', 'ParadiseLost:TheChildMurdersatRobinHoodHills', 'Mother', 'Murderat1600', "Dante'sPeak", 'LostHighway', 'Crash', 
'G.I.Jane', 'CopLand', 'ConspiracyTheory', 'DesperateMeasures', '187', 'Edge,The', 'KisstheGirls', 'Game,The', 'UTurn', 'HowtoBeaPlayer', 'PlayingGod', 'HouseofYes,The', 'Bean', 'MadCity', 'BoogieNights', 'CriticalCare', 'ManWhoKnewTooLittle,The', 'Alien:Resurrection', 'Apostle,The', 'DeconstructingHarry', 'JackieBrown', 'WagtheDog', 'DesperateMeasures', 'HardRain', 'Fallen', 'ProphecyII,The', 'SpiceWorld', 'DeepRising', 'WeddingSinger,The', 'Sphere', 'Client,The', "OneFlewOvertheCuckoo'sNest", 'Spawn', 'Assignment,The', 'Wonderland', 'Incognito', 'BluesBrothers2000', 'SuddenDeath', 'AceVentura:WhenNatureCalls', 'Powder', 'DangerousMinds', 'Clueless', 'Bio-Dome', 'BlackSheep', 'MaryReilly', 'BridgesofMadisonCounty,The', 'Jeffrey', 'JudgeDredd', 'MightyMorphinPowerRangers:TheMovie', 'Showgirls', 'Houseguest', 'Heavyweights', 'Miracleon34thStreet', 'TalesFromtheCryptPresents:DemonKnight', 'StarTrek:Generations', "Muriel'sWedding", 'AdventuresofPriscilla,QueenoftheDesert,The', 'Flintstones,The', 'NakedGun331/3:TheFinalInsult', 'TrueLies', 'AddamsFamilyValues', 'AgeofInnocence,The', 'BeverlyHillsCopIII', 'BlackBeauty', 'FearofaBlackHat', 'LastActionHero', 'ManWithoutaFace,The', 'Mrs.Doubtfire', 'RadiolandMurders', 'RobinHood:MeninTights', 'SerialMom', 'StrikingDistance', 'SuperMarioBros.', 'ThreeMusketeers,The', 'LittleRascals,The', 'BradyBunchMovie,The', 'Ghost', 'Batman', 'Pinocchio', 'Mission:Impossible', 'Thinner', 'SpyHard', 'CloseShave,A', 'Jack', 'Kingpin', 'NuttyProfessor,The', 'VeryBradySequel,A', 'TalesfromtheCryptPresents:BordelloofBlood', 'MyFavoriteYear', 'AppleDumplingGang,The', 'OldYeller', 'ParentTrap,The', 'Cinderella', 'MaryPoppins', 'AliceinWonderland', "WilliamShakespeare'sRomeoandJuliet", 'AladdinandtheKingofThieves', 'E.T.theExtra-Terrestrial', 'ChildrenoftheCorn:TheGathering', 'BobRoberts', 'Transformers:TheMovie,The', 'ToKillaMockingbird', 'HaroldandMaude', 'DaytheEarthStoodStill,The', 'DuckSoup', 'Highlander', 'Fantasia', 'Heathers', 
'ForbiddenPlanet', 'ButchCassidyandtheSundanceKid', 'AmericanWerewolfinLondon,An', "Amityville1992:It'sAboutTime", 'Amityville3-D', 'Amityville:ANewGeneration', 'AmityvilleII:ThePossession', 'AmityvilleHorror,The', 'AmityvilleCurse,The', 'Birds,The', 'Blob,The', 'BodySnatcher,The', 'BurntOfferings', 'Carrie', 'Omen,The', 'StarTrek:TheMotionPicture', 'StarTrekV:TheFinalFrontier', 'Grease', 'Jaws2', 'Jaws3-D', 'BastardOutofCarolina', "JackieChan'sFirstStrike", 'BeverlyHillsNinja', 'FreeWilly3:TheRescue', 'Nixon', 'Cry,theBelovedCountry', 'CrossingGuard,The', 'Smoke', 'LikeWaterForChocolate', 'SecretofRoanInish,The', 'Vanyaon42ndStreet', 'JungleBook,The', 'RedRockWest', 'BronxTale,A', 'Rudy', 'ShortCuts', 'Tombstone', 'CourageUnderFire', 'Dragonheart', 'JamesandtheGiantPeach', 'Dr.Strangeloveor:HowILearnedtoStopWorryingandLovetheBomb', 'Trainspotting', 'FirstWivesClub,The', 'Matilda', 'PhiladelphiaStory,The', 'Vertigo', 'NorthbyNorthwest', 'Apartment,The', 'SomeLikeItHot', 'Casablanca', 'MalteseFalcon,The', 'MyFairLady', 'Sabrina', 'RomanHoliday', 'SunsetBlvd.', 'Notorious', 'ToCatchaThief', 'AdventuresofRobinHood,The', 'EastofEden', 'ThinMan,The', 'HisGirlFriday', 'AroundtheWorldin80Days', "It'saWonderfulLife", 'BringingUpBaby', 'AfricanQueen,The', 'CatonaHotTinRoof', 'FlyAwayHome', 'Dumbo', 'Bananas', 'Candidate,The', 'BonnieandClyde', 'DialMforMurder', 'RebelWithoutaCause', 'StreetcarNamedDesire,A', 'Peoplevs.LarryFlynt,The', 'MyLeftFoot', 'MagnificentSeven,The', 'LawrenceofArabia', 'WingsofDesire', 'ThirdMan,The', 'AnnieHall', 'Boot,Das', 'LocalHero', 'Manhattan', "Miller'sCrossing", 'TreasureoftheSierraMadre,The', 'GreatEscape,The', 'DeerHunter,The', 'DownbyLaw', 'CoolHandLuke', 'GreatDictator,The', 'BigSleep,The', 'Ben-Hur', 'Gandhi', 'KillingFields,The', 'MyLifeasaDog', 'ManWhoWouldBeKing,The', 'Shine', 'KamaSutra:ATaleofLove', 'Daytrippers,The', 'Traveller', 'AddictedtoLove', 'Ponette', 'MyOwnPrivateIdaho', 'Anastasia', 'MouseHunt', 'MoneyTrain', 
'MortalKombat', 'Pocahontas', 'Misérables,Les', "ThingstoDoinDenverwhenYou'reDead", 'VampireinBrooklyn', 'BrokenArrow', "YoungPoisoner'sHandbook,The", 'NeverEndingStoryIII,The', 'RobRoy', 'DieHard:WithaVengeance', 'LordofIllusions', 'Species', 'WalkintheClouds,A', 'Waterworld', "WhiteMan'sBurden", 'WildBill', 'Farinelli:ilcastrato', 'HeavenlyCreatures', 'InterviewwiththeVampire', "KidinKingArthur'sCourt,A", "MaryShelley'sFrankenstein", 'QuickandtheDead,The', "StephenKing'sTheLangoliers", 'TalesfromtheHood', 'VillageoftheDamned', 'ClearandPresentDanger', "WesCraven'sNewNightmare", 'Speed', 'Wolf', 'WyattEarp', 'AnotherStakeout', 'BlownAway', 'BodySnatchers', 'BoxingHelena', "CitySlickersII:TheLegendofCurly'sGold", 'Cliffhanger', 'Coneheads', 'DemolitionMan', 'FatalInstinct', 'EnglishmanWhoWentUpaHill,ButCameDownaMountain,The', 'Kalifornia', 'Piano,The', 'RomeoIsBleeding', 'SecretGarden,The', 'SoninLaw', 'TerminalVelocity', 'HourofthePig,The', 'BeautyandtheBeast', 'WildBunch,The', 'Hellraiser:Bloodline', 'PrimalFear', 'TrueCrime', 'Stalingrad', 'Heavy', 'Fan,The', 'HunchbackofNotreDame,The', 'Eraser', 'BigSqueeze,The', 'PoliceStory4:ProjectS', "DanielDefoe'sRobinsonCrusoe", 'ForWhomtheBellTolls', 'AmericaninParis,An', 'RearWindow', 'ItHappenedOneNight', 'MeetMeinSt.Louis', 'AllAboutEve', 'Rebecca', 'Spellbound', 'FatheroftheBride', 'Gigi', 'Laura', 'LostHorizon', 'MyManGodfrey', 'Giant', '39Steps,The', 'NightoftheLivingDead', 'BlueAngel,The', 'Picnic', 'ExtremeMeasures', 'Chamber,The', 'DavyCrockett,KingoftheWildFrontier', 'SwissFamilyRobinson', 'AngelsintheOutfield', 'ThreeCaballeros,The', 'SwordintheStone,The', 'SoDeartoMyHeart', 'RobinHood:PrinceofThieves', 'Sleepers', 'Victor/Victoria', 'GreatRace,The', 'CryingGame,The', "Sophie'sChoice", 'ChristmasCarol,A', "Microcosmos:Lepeupledel'herbe", 'Fog,The', 'EscapefromNewYork', 'Howling,The', 'ReturnofMartinGuerre,The', 'TinDrum,The', 'CooktheThiefHisWife&HerLover,The', 'PathsofGlory', 'Grifters,The', 'TheInnocent', 
'ThinBlueLine,The', 'ParisIsBurning', 'OnceUponaTimeintheWest', 'Ran', 'QuietMan,The', 'OnceUponaTimeinAmerica', 'SeventhSeal,The', 'Glory', 'RosencrantzandGuildensternAreDead', 'TouchofEvil', 'Chinatown', 'StandbyMe', 'M', 'ManchurianCandidate,The', 'PumpUptheVolume', 'ArsenicandOldLace', 'FriedGreenTomatoes', 'HighNoon', 'SomewhereinTime', 'BeingThere', 'Paris,Texas', 'Alien3', 'BloodForDracula', 'AudreyRose', 'BloodBeach', 'BodyParts', 'BodySnatchers', 'BrideofFrankenstein', 'Candyman', 'CapeFear', 'CatPeople', 'Nosferatu', 'Crucible,The', 'FireontheMountain', 'Volcano', 'ConantheBarbarian', 'KulltheConqueror', 'Wishmaster', 'IKnowWhatYouDidLastSummer', 'RocketMan', 'IntheLineofFire', 'ExecutiveDecision', 'PerfectWorld,A', "McHale'sNavy", 'LeaveIttoBeaver', 'Jackal,The', 'SevenYearsinTibet', 'DarkCity', 'AmericanPresident,The', 'Casino', 'Persuasion', 'KickingandScreaming', 'CityHall', 'BasketballDiaries,The', 'BrowningVersion,The', 'LittleWomen', 'MiamiRhapsody', 'Wonderful,HorribleLifeofLeniRiefenstahl,The', 'Barcelona', "Widows'Peak", 'HouseoftheSpirits,The', "Singin'intheRain", 'BadMoon', 'EnchantedApril', 'Sex,Lies,andVideotape', 'StrictlyBallroom', 'BetterOffDead...', 'SubstanceofFire,The', 'TinMen', 'Othello', 'Carrington', 'ToDieFor', 'HomefortheHolidays', 'Juror,The', 'IntheBleakMidwinter', 'CanadianBacon', 'FirstKnight', 'Mallrats', 'NineMonths', 'BoysontheSide', 'CircleofFriends', 'ExittoEden', 'Fluke', 'ImmortalBeloved', 'Junior', 'Nell', 'QueenMargot', 'Corrina,Corrina', 'Dave', 'GoFish', 'MadeinAmerica', 'Philadelphia', 'Shadowlands', 'Sirens', 'Threesome', 'PrettyWoman', 'JaneEyre', 'LastSupper,The', 'Ransom', 'Crow:CityofAngels,The', 'MichaelCollins', 'RulingClass,The', 'RealGenius', 'Benny&Joon', 'Saint,The', 'MatchMaker,The', 'Amistad', 'TomorrowNeverDies', 'ReplacementKillers,The', 'BurntBytheSun', 'RedCorner', 'Jumanji', 'FatheroftheBridePartII', 'AcrosstheSeaofTime', 'LawnmowerMan2:BeyondCyberspace', 'FairGame', 'Screamers', 'NickofTime', 
'BeautifulGirls', 'HappyGilmore', 'IfLucyFell', 'Boomerang', 'ManoftheYear', 'Addiction,The', 'Casper', 'Congo', 'DevilinaBlueDress', 'JohnnyMnemonic', 'Kids', 'MuteWitness', 'Prophecy,The', 'SomethingtoTalkAbout', 'ThreeWishes', 'CastleFreak', 'DonJuanDeMarco', 'DropZone', 'Dumb&Dumber', 'FrenchKiss', 'LittleOdessa', 'MilkMoney', 'BeyondBedlam', 'OnlyYou', 'PerezFamily,The', 'Roommates', 'RelativeFear', 'SwimmingwithSharks', 'TommyBoy', 'Baby-SittersClub,The', 'BulletsOverBroadway', 'Crooklyn', 'ItCouldHappentoYou', 'RichieRich', 'Speechless', 'Timecop', 'BadCompany', 'BoysLife', 'IntheMouthofMadness', 'AirUpThere,The', 'HardTarget', 'Heaven&Earth', 'JimmyHollywood', 'ManhattanMurderMystery', 'MenaceIISociety', 'PoeticJustice', 'Program,The', 'RisingSun', 'Shadow,The', 'Thirty-TwoShortFilmsAboutGlennGould', 'Andre', 'CelluloidCloset,The', 'GreatDayinHarlem,A', 'OneFineDay', 'Candyman:FarewelltotheFlesh', 'Frisk', 'Girl6', 'Eddie', 'SpaceJam', 'Mrs.Winterbourne', 'Faces', 'MulhollandFalls', 'GreatWhiteHype,The', 'Arrival,The', 'Phantom,The', 'Daylight', 'Alaska', 'Fled', 'Power98', 'EscapefromL.A.', 'Bogus', 'Bulletproof', 'Halloween:TheCurseofMichaelMyers', 'GayDivorcee,The', 'Ninotchka', 'MeetJohnDoe', 'IntheLineofDuty2', 'LochNess', 'LastManStanding', 'GlimmerMan,The', 'Pollyanna', 'ShaggyDog,The', 'Freeway', 'ThatThingYouDo!', 'ToGillianonHer37thBirthday', 'LookingforRichard', 'Murder,MySweet', 'DaysofThunder', 'PerfectCandidate,A', 'TwoorThreeThingsIKnowAboutHer', 'BloodyChild,The', 'Braindead', 'BadTaste', 'Diva', 'NightonEarth', 'ParisWasaWoman', 'Amityville:Dollhouse', "AprilFool'sDay", 'Believers,The', 'NosferatuaVenezia', 'JingleAlltheWay', 'GardenofFinzi-Contini,The', 'MyFellowAmericans', 'IceStorm,The', 'Michael', 'WholeWideWorld,The', 'HeartsandMinds', 'FoolsRushIn', 'Touch', 'VegasVacation', 'LoveJones', 'PicturePerfect', 'CareerGirls', "She'sSoLovely", 'MoneyTalks', 'ExcessBaggage', 'ThatDarnCat!', 'Peacemaker,The', 'SoulFood', 'MoneyTalks', 
'WashingtonSquare', 'TellingLiesinAmerica', 'YearoftheHorse', 'Phantoms', 'LifeLessOrdinary,A', "Eve'sBayou", 'OneNightStand', 'TangoLesson,The', 'MortalKombat:Annihilation', 'Bent', 'Flubber', 'ForRicherorPoorer', 'HomeAlone3', 'Scream2', 'SweetHereafter,The', 'TimeTracers', 'Postman,The', 'WinterGuest,The', 'Kundun', 'Mr.Magoo', 'BigLebowski,The', 'Afterglow', 'Mavieenrose', 'GreatExpectations', 'Oscar&Lucinda', 'Vermin', 'HalfBaked', 'DangerousBeauty', 'NilByMouth', 'Twilight', 'U.S.Marshalls', 'LoveandDeathonLongIsland', 'WildThings', 'PrimaryColors', 'LostinSpace', 'MercuryRising', 'CityofAngels', 'CityofLostChildren,The', 'TwoBits', 'FarewellMyConcubine', 'DeadMan', 'RaisetheRedLantern', 'WhiteSquall', 'Unforgettable', 'DownPeriscope', 'FlowerofMySecret,The', 'Craft,The', 'HarriettheSpy', 'ChainReaction', 'IslandofDr.Moreau,The', 'FirstKid', 'Funeral,The', "Preacher'sWife,The", 'ParadiseRoad', 'BrassedOff', 'ThousandAcres,A', 'SmileLikeYours,A', 'MurderintheFirst', 'Airheads', 'WithHonors', "What'sLoveGottoDowithIt", 'KillingZoe', 'RenaissanceMan', 'Charade', 'FoxandtheHound,The', 'BigBlue,The', 'BootyCall', 'HowtoMakeanAmericanQuilt', 'Georgia', 'IndianintheCupboard,The', 'BlueintheFace', 'UnstrungHeroes', 'Unzipped', 'BeforeSunrise', "Nobody'sFool", 'PushingHands', 'ToLive', 'DazedandConfused', 'Naked', 'Orlando', 'RubyinParadise', 'SomeFolksCallItaSlingBlade', 'MonthbytheLake,A', 'FunnyFace', 'AffairtoRemember,An', 'LittleLordFauntleroy', 'InspectorGeneral,The', 'WinniethePoohandtheBlusteryDay', 'HearMySong', 'Mediterraneo', 'PassionFish', 'GratefulDead', 'EyeforanEye', 'Fear', 'Solo', 'Substitute,The', "Heaven'sPrisoners", 'TriggerEffect,The', 'MotherNight', 'DangerousGround', 'MaximumRisk', "RichMan'sWife,The", 'ShadowConspiracy', 'Blood&Wine', 'Turbulence', 'Underworld', 'BeauticianandtheBeast,The', "CatsDon'tDance", 'AnnaKarenina', 'KeystoTulsa', 'HeadAboveWater', 'Hercules', 'LastTimeICommittedSuicide,The', 'KissMe,Guido', 'BigGreen,The', 
'StuartSavesHisFamily', 'CabinBoy', 'CleanSlate', 'LightningJack', ...]
https://drive.google.com/file/d/1Q_dhdirnxfNlVuFXgQOXKUQyTta45ALQ/view?usp=share_link
# Load the previously scraped tweets from the combined CSV file.
tweets = pd.read_csv('./data/complete_input.csv')
# Show a sample of the dataset.
tweets.head()
| Unnamed: 0.1 | Unnamed: 0 | id | url | date | content | keyword | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1515818269026013185 | https://twitter.com/OramaStore/status/15158182... | 2022-04-17 22:23:32+00:00 | #love #toystory #trendy #onlineshop Green Toys... | #ToyStory |
| 1 | 1 | 1 | 1515816410261430281 | https://twitter.com/ComeOnMimi/status/15158164... | 2022-04-17 22:16:09+00:00 | @lolaindigoarg @Los40 @Del40al1_es Voto por #T... | #ToyStory |
| 2 | 2 | 2 | 1515810680615710730 | https://twitter.com/CDSXCalibur/status/1515810... | 2022-04-17 21:53:23+00:00 | Trying to be #Patient when coming to #Walmart ... | #ToyStory |
| 3 | 3 | 3 | 1515808758080978950 | https://twitter.com/cstephens2/status/15158087... | 2022-04-17 21:45:44+00:00 | Jessie was absolutely enthralled by her mini-m... | #ToyStory |
| 4 | 4 | 4 | 1515797388304302083 | https://twitter.com/lolaindigogrn/status/15157... | 2022-04-17 21:00:34+00:00 | @lolaindigoarg @entreotroscien @Los40 @Del40al... | #ToyStory |
tweets.shape
(784138, 8)
import seaborn as sns

# Tweet volume per movie hashtag, ranked most- to least-tweeted, to show
# how skewed the keyword distribution is.
keyword_counts = tweets['keyword'].value_counts()
num_tweets_df = pd.DataFrame({'num_tweets': list(keyword_counts)})
num_tweets_df['num'] = num_tweets_df.index
sns.lineplot(data=num_tweets_df, x="num", y="num_tweets")
<AxesSubplot:xlabel='num', ylabel='num_tweets'>
We created a class called Cleaner which has the necessary methods to clean the text:
* Remove emojis
* Remove URLs
* Remove numbers and non-alpha characters
* Remove punctuation
* Remove stop words
* Tokenization
After these steps we obtain a new tokenized text column called: content_clean# Here we convert the date column to datetime, and also create an additional column with yyyy-mm-dd format.
# Parse the raw date strings into timestamps, then derive a plain
# yyyy-mm-dd string column (normalize() floors each timestamp to midnight
# before formatting).
tweets['datetime'] = pd.to_datetime(tweets['date'])
midnights = tweets['datetime'].dt.normalize()
tweets['date'] = midnights.dt.strftime('%Y-%m-%d')
# Build the Cleaner over the columns we need and run each cleaning step:
# duplicate removal, raw-content extraction, hashtag and @-mention extraction.
cleaner = Cleaner(tweets, ['date', 'content', 'keyword'])
cleaner.drop_duplicates()
cleaner.extract_content()
cleaner.extract_hashtags()
cleaner.extract_usernames()
# Collect the cleaned frame and give it a fresh integer index.
clean_tweets = cleaner.get_result().reset_index()
clean_tweets
| index | date | content | keyword | content_clean | hashtags | usernames | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 2022-04-17 | #love #toystory #trendy #onlineshop Green Toys... | #ToyStory | [love, toystory, trendy, onlineshop, green, to... | [love, toystory, trendy, onlineshop] | [] |
| 1 | 1 | 2022-04-17 | @lolaindigoarg @Los40 @Del40al1_es Voto por #T... | #ToyStory | [lolaindigoarg, los, del, es, voto, por, toyst... | [ToyStory, Del40al1CocaCola, Mivoto40LolaIndigo] | [lolaindigoarg, Los40, Del40al1_es, Los40, Del... |
| 2 | 2 | 2022-04-17 | Trying to be #Patient when coming to #Walmart ... | #ToyStory | [trying, patient, coming, walmart, girl, neede... | [Patient, Walmart, patience, humility, empathy... | [] |
| 3 | 3 | 2022-04-17 | Jessie was absolutely enthralled by her mini-m... | #ToyStory | [jessie, absolutely, enthralled, mini, pixarpi... | [pixarpier, toystory, disneycaliforniaadventur... | [] |
| 4 | 4 | 2022-04-17 | @lolaindigoarg @entreotroscien @Los40 @Del40al... | #ToyStory | [lolaindigoarg, entreotroscien, los, del, es, ... | [ToyStory, Del40al1CocaCola, Mivoto40LolaIndigo] | [lolaindigoarg, entreotroscien, Los40, Del40al... |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 757833 | 784133 | 2022-05-01 | didn't know martin was this funny #yousocrazy | #YouSoCrazy | [know, martin, funny, yousocrazy] | [yousocrazy] | [] |
| 757834 | 784134 | 2022-05-10 | #YouSoCrazy, Know where there is a 64 Chevelle... | #YouSoCrazy | [yousocrazy, know, chevelle, speed, power, win... | [YouSoCrazy,, USA, SgtHeiligUSMC] | [jeffdunham] |
| 757835 | 784135 | 2022-05-14 | #YouSoCrazy Remember the Clipper Chip I spok... | #YouSoCrazy | [yousocrazy, remember, clipper, chip, spoke, c... | [YouSoCrazy, ThatWasp, USA, USMC] | [elonmusk] |
| 757836 | 784136 | 2022-05-18 | @DAJustAsking @original1ny @Road25th @MrChaseb... | #YouSoCrazy | [dajustasking, original, road, mrchasebooth, r... | [YouSoCrazy] | [DAJustAsking, original1ny, Road25th, MrChaseb... |
| 757837 | 784137 | 2022-05-30 | HAPPY@MEMORIAL DAY \nGO STREAM MY NEW ALBUM #Y... | #YouSoCrazy | [happy, memorial, day, stream, new, album, you... | [YOUSOCRAZY] | [] |
757838 rows × 7 columns
In this section we perform the sentiment analysis with the text of each tweet. A positive, neutral, or negative sentiment will be assigned to each tweet. The vaderSentiment library will be used to calculate the sentiment. This package returns a value between -1 and 1 to refer to the sentiment.
# We create an instance of the class and calculate the sentiment.
sentiment = SentimentAnalyzer(clean_tweets)
# get_sentiments() scores every tweet with vaderSentiment: a compound
# score in [-1, 1] plus a positive/neutral/negative label.
sentiment.get_sentiments()
tweets_with_sentiment = sentiment.get_result()
# The results appear in two new columns: sentiment_score and sentiment.
tweets_with_sentiment
| index | date | content | keyword | content_clean | hashtags | usernames | sentiment_score | sentiment | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2022-04-17 | #love #toystory #trendy #onlineshop Green Toys... | #ToyStory | [love, toystory, trendy, onlineshop, green, to... | [love, toystory, trendy, onlineshop] | [] | 0.6369 | positive |
| 1 | 1 | 2022-04-17 | @lolaindigoarg @Los40 @Del40al1_es Voto por #T... | #ToyStory | [lolaindigoarg, los, del, es, voto, por, toyst... | [ToyStory, Del40al1CocaCola, Mivoto40LolaIndigo] | [lolaindigoarg, Los40, Del40al1_es, Los40, Del... | 0.0000 | neutral |
| 2 | 2 | 2022-04-17 | Trying to be #Patient when coming to #Walmart ... | #ToyStory | [trying, patient, coming, walmart, girl, neede... | [Patient, Walmart, patience, humility, empathy... | [] | 0.9861 | positive |
| 3 | 3 | 2022-04-17 | Jessie was absolutely enthralled by her mini-m... | #ToyStory | [jessie, absolutely, enthralled, mini, pixarpi... | [pixarpier, toystory, disneycaliforniaadventur... | [] | 0.0000 | neutral |
| 4 | 4 | 2022-04-17 | @lolaindigoarg @entreotroscien @Los40 @Del40al... | #ToyStory | [lolaindigoarg, entreotroscien, los, del, es, ... | [ToyStory, Del40al1CocaCola, Mivoto40LolaIndigo] | [lolaindigoarg, entreotroscien, Los40, Del40al... | 0.0000 | neutral |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 757833 | 784133 | 2022-05-01 | didn't know martin was this funny #yousocrazy | #YouSoCrazy | [know, martin, funny, yousocrazy] | [yousocrazy] | [] | 0.5228 | positive |
| 757834 | 784134 | 2022-05-10 | #YouSoCrazy, Know where there is a 64 Chevelle... | #YouSoCrazy | [yousocrazy, know, chevelle, speed, power, win... | [YouSoCrazy,, USA, SgtHeiligUSMC] | [jeffdunham] | 0.8453 | positive |
| 757835 | 784135 | 2022-05-14 | #YouSoCrazy Remember the Clipper Chip I spok... | #YouSoCrazy | [yousocrazy, remember, clipper, chip, spoke, c... | [YouSoCrazy, ThatWasp, USA, USMC] | [elonmusk] | -0.6734 | negative |
| 757836 | 784136 | 2022-05-18 | @DAJustAsking @original1ny @Road25th @MrChaseb... | #YouSoCrazy | [dajustasking, original, road, mrchasebooth, r... | [YouSoCrazy] | [DAJustAsking, original1ny, Road25th, MrChaseb... | 0.0000 | neutral |
| 757837 | 784137 | 2022-05-30 | HAPPY@MEMORIAL DAY \nGO STREAM MY NEW ALBUM #Y... | #YouSoCrazy | [happy, memorial, day, stream, new, album, you... | [YOUSOCRAZY] | [] | 0.0000 | neutral |
757838 rows × 9 columns
# Aggregate the per-tweet scores into a mean sentiment per movie hashtag;
# get_result() then returns the keyword -> mean sentiment_score table.
sentiment.mean_sentiment_by_keyword()
mean_sentiments = sentiment.get_result()
mean_sentiments
| keyword | sentiment_score | |
|---|---|---|
| 0 | #'TilThereWasYou | 0.000000 |
| 1 | #1-900 | 0.057174 |
| 2 | #101Dalmatians | 0.259444 |
| 3 | #12AngryMen | 0.073759 |
| 4 | #187 | -0.048073 |
| ... | ... | ... |
| 1371 | #YoungFrankenstein | 0.284343 |
| 1372 | #YoungGuns | 0.284644 |
| 1373 | #YoungGunsII | -0.065944 |
| 1374 | #ZeusandRoxanne | 0.000000 |
| 1375 | #unknown | 0.272861 |
1376 rows × 2 columns
In a recommendation engine like Netflix or MovieLens, there is a user base and a catalogue of products (movies for the above two systems). Given that every user has given some products in the system a rating, we would want to estimate how they would rate the goods they have not yet given a rating for so that we may provide them suggestions. In this instance, a matrix may be used to represent all the data we know about the current ratings.
# Preview the MovieLens ratings (user_id, item_id, rating, timestamp).
ratings.head()
| user_id | item_id | rating | timestamp | |
|---|---|---|---|---|
| 0 | 196 | 242 | 3 | 881250949 |
| 1 | 186 | 302 | 3 | 891717742 |
| 2 | 22 | 377 | 1 | 878887116 |
| 3 | 244 | 51 | 2 | 880606923 |
| 4 | 166 | 346 | 1 | 886397596 |
The rationale for utilising matrix factorization to address this issue is that there ought to be some latent characteristics that affect how a user evaluates a product. For instance, if two users share a preference for action movies, they could both give a certain movie high ratings if they enjoy the stars or actresses in it. Because the attributes connected with the user and the object should correspond, if we can uncover these latent traits, we should be able to predict a rating with regard to a certain user and an item. In our quest to identify the various features, we also operate under the presumption that there are fewer features overall than there are users and items. Given that it would be illogical to assume that each user is connected to a specific feature, this assumption should not be difficult to grasp (although this is not impossible). In any event, there would be no use in offering suggestions if this were the case because none of these people would be interested in the things that other users have rated. The same justification also holds true for the products.
Firstly, we have a set U of users, and a set D of items. Let R of size |U| X |D| be the matrix that contains all the ratings that the users have assigned to the items. Also, we assume that we would like to discover K latent features. Our task, then, is to find two matrices P (a |U| X K matrix) and Q (a |D| X K matrix) such that their product approximates R
We now need to figure out how to get P and Q. One method of solving this issue is to initialise the two matrices with certain values, determine how "different" the product of the two is from $\mathbf{R}$, and then attempt to reduce this difference repeatedly. This approach, known as gradient descent, seeks to identify a local minimum of the difference. The following equation may be used to determine this difference, which is typically referred to as the error between the estimated rating and the true rating, for each user-item pair:
Knowing which way to change the values of p and q will help us reduce mistake. To differentiate the aforementioned equation with regard to these two variables independently, we state that we need to know the gradient at the present values:
we can now create the update rules for both pik and qkj after obtaining the gradient:
Here, α is a constant whose value determines the rate of approaching the minimum. Usually we will choose a small value for α, say 0.0002. This is because if we make too large a step towards the minimum we may run into the risk of missing the minimum and end up oscillating around the minimum. The process can then be carried out again until the error converges to its minimum using the aforementioned updating rules. The following equation may be used to compute the total error and help us decide whether to halt the procedure.
Regularization is a frequent addition to this fundamental approach to prevent overfitting. To do this, include the parameter beta and change the squared error as follows:
# User x item rating matrix: rows are user_id, columns are item_id;
# missing ratings become 0 (treated as "unrated" downstream).
pivot_ratings = ratings.pivot_table(index='user_id', columns='item_id',
                                    values='rating', fill_value=0)
pivot_ratings_numpy = np.array(pivot_ratings)
pivot_ratings
| item_id | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 1673 | 1674 | 1675 | 1676 | 1677 | 1678 | 1679 | 1680 | 1681 | 1682 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 1 | 5 | 3 | 4 | 3 | 3 | 5 | 4 | 1 | 5 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 4 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 939 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 940 | 0 | 0 | 0 | 2 | 0 | 0 | 4 | 5 | 3 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 941 | 5 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 942 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 943 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
943 rows × 1682 columns
def matrix_factorization(R, P, Q, K, steps=20, alpha=0.002, beta=0.02):
    '''Factorize the rating matrix R so that P @ Q.T approximates R on the
    observed (non-zero) entries, via regularized stochastic gradient descent.

    R: rating matrix of shape |U| x |D|; 0 means "not rated"
    P: |U| * K (user features matrix, updated in place)
    Q: |D| * K (item features matrix, updated in place)
    K: number of latent features
    steps: maximum number of gradient-descent iterations
    alpha: learning rate
    beta: regularization parameter

    Returns the pair (P, Q) with Q back in |D| x K orientation.'''
    Q = Q.T
    for step in range(steps):
        # One SGD pass over every observed rating.
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # Error between the true and the predicted rating.
                    eij = R[i][j] - np.dot(P[i,:], Q[:,j])
                    for k in range(K):
                        # Keep the pre-update value so the Q step uses the
                        # same gradient point as the P step (the previous
                        # version read the already-updated P[i][k], biasing
                        # the Q update).
                        p_ik = P[i][k]
                        P[i][k] = p_ik + alpha * (2 * eij * Q[k][j] - beta * p_ik)
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * p_ik - beta * Q[k][j])
        # Total regularized squared error over the observed entries only.
        # (The previous version also computed np.dot(P, Q) here each step
        # and never used it — removed as pure wasted work.)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:], Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        # Stop early once the error is small enough (near a local minimum).
        if e < 0.001:
            break
    return P, Q.T
from sklearn.metrics import mean_squared_error
import seaborn as sns

R = pivot_ratings_numpy
# N: number of users (rows); M: number of movies (columns).
N = len(pivot_ratings_numpy)
M = len(pivot_ratings_numpy[0])
# Sweep the number of latent features K and record the reconstruction RMSE
# of the factorized matrix against the original rating matrix.
ks = []
errors = []
for K in range(20, 130, 10):
    print(K)
    P = np.random.rand(N, K)
    Q = np.random.rand(M, K)
    nP, nQ = matrix_factorization(R, P, Q, K)
    nR = np.dot(nP, nQ.T)
    error = np.sqrt(mean_squared_error(nR, pivot_ratings_numpy))
    ks.append(K)
    errors.append(error)
20 30 40 50 60 70 80 90 100 110 120
# Plot RMSE as a function of the number of latent features K.
results_df = pd.DataFrame({'Ks': ks, 'Error': errors})
sns.lineplot(data=results_df, x="Ks", y="Error")
<AxesSubplot:xlabel='Ks', ylabel='Error'>
# Refit the factorization with the chosen number of latent features.
K = 20
print(K)
P = np.random.rand(N,K)
Q = np.random.rand(M,K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)
error = np.sqrt(mean_squared_error(nR,pivot_ratings_numpy))
# NOTE(review): these appends add a second K=20 entry to the sweep lists
# after the plot above was already drawn — presumably leftover; confirm
# nothing downstream re-reads ks/errors before removing.
ks.append(K)
errors.append(error)
20
# Learned user-feature matrix P (one K-dim row per user).
nP
array([[ 0.03648385, -0.02613549, 0.90381208, ..., 0.31513689,
0.77038106, 0.60413345],
[ 0.31284609, 0.41417456, 0.02151071, ..., 0.71833285,
0.59297537, 0.16167853],
[ 0.55417461, 0.40020932, 0.68054434, ..., 0.34988877,
0.52498041, 0.22562116],
...,
[ 0.08043566, 0.37980732, 0.69904216, ..., 0.28320006,
0.3645188 , 0.96652557],
[ 0.93805764, 0.58563568, 0.43103136, ..., 0.30787102,
0.62253793, 0.01026524],
[ 1.01270474, 0.1601151 , 0.07925304, ..., -0.0452616 ,
0.64324258, 0.53898052]])
# Learned item-feature matrix Q (one K-dim row per movie).
nQ
array([[ 0.50115123, 0.42308078, 0.38454792, ..., 0.46171081,
0.44037719, 0.50532643],
[ 0.46904987, 0.20339276, 0.24911969, ..., 0.24497574,
0.14785684, 0.35058309],
[-0.13286523, 0.36684465, 0.42655599, ..., 0.68465855,
0.53722062, 0.29784699],
...,
[ 0.2771404 , 0.56385159, 0.21315201, ..., 0.86497975,
0.79489993, 0.37211599],
[ 0.67348797, 0.58265664, 0.56089141, ..., 0.32326911,
0.03929956, 0.84195904],
[ 0.4539293 , 0.02085084, 0.91456539, ..., 0.07665726,
0.42815704, 0.09118767]])
# Reconstructed (predicted) rating matrix nP @ nQ.T.
nR
array([[3.87909017, 3.36820967, 3.68519487, ..., 3.21041113, 3.95672512,
4.21884039],
[4.35167391, 3.65556958, 3.52968052, ..., 4.07938811, 4.43574577,
3.7527512 ],
[2.99864371, 2.07448371, 3.03353572, ..., 3.17242923, 3.47181869,
4.36621268],
...,
[4.26467684, 3.42247284, 3.79048279, ..., 3.82435845, 4.86786801,
4.93202147],
[4.31068699, 3.61153832, 3.01315414, ..., 3.87808803, 4.84149733,
4.78158843],
[3.86437803, 3.55591682, 2.59046917, ..., 3.16865349, 4.36711078,
3.56571529]])
# Put the reconstructed ratings back into a DataFrame with the original
# item ids as columns and 1-based user ids as the index.
nR_df = pd.DataFrame(nR,
                     columns=list(pivot_ratings.columns),
                     index=np.arange(1, len(nR) + 1))
nR_df
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 1673 | 1674 | 1675 | 1676 | 1677 | 1678 | 1679 | 1680 | 1681 | 1682 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 3.879090 | 3.368210 | 3.685195 | 3.459717 | 3.054653 | 4.339194 | 3.916772 | 4.036979 | 4.224187 | 3.586611 | ... | 3.079818 | 3.282552 | 3.663457 | 4.081385 | 2.923132 | 2.430859 | 5.585002 | 3.210411 | 3.956725 | 4.218840 |
| 2 | 4.351674 | 3.655570 | 3.529681 | 3.112529 | 2.940769 | 3.488236 | 3.900049 | 3.806634 | 3.295875 | 3.655257 | ... | 3.924963 | 3.786994 | 4.260260 | 3.632338 | 2.741857 | 2.485698 | 4.676345 | 4.079388 | 4.435746 | 3.752751 |
| 3 | 2.998644 | 2.074484 | 3.033536 | 2.473550 | 3.215418 | 3.962115 | 3.597399 | 3.452422 | 3.867059 | 3.544624 | ... | 2.843240 | 3.725357 | 3.376004 | 2.752446 | 3.009393 | 2.157853 | 4.414994 | 3.172429 | 3.471819 | 4.366213 |
| 4 | 5.098304 | 4.145585 | 4.375619 | 4.196334 | 4.142603 | 4.376732 | 4.885350 | 4.871924 | 4.890032 | 4.352509 | ... | 4.366878 | 4.948847 | 4.774750 | 5.248740 | 4.157176 | 3.093158 | 6.263922 | 4.666183 | 5.389383 | 5.119397 |
| 5 | 3.250994 | 2.861887 | 2.870367 | 3.223586 | 2.576179 | 3.674511 | 3.586622 | 3.162714 | 3.191373 | 3.515218 | ... | 2.894107 | 3.269883 | 3.036352 | 3.196659 | 2.239685 | 2.210504 | 4.620769 | 2.826576 | 3.191855 | 3.152520 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 939 | 5.421894 | 4.712091 | 4.189260 | 5.513291 | 4.499832 | 5.297310 | 5.020106 | 5.583102 | 5.526359 | 5.065142 | ... | 4.157459 | 4.875182 | 4.707999 | 5.463039 | 3.803949 | 3.209369 | 6.921711 | 4.295340 | 6.027965 | 4.895784 |
| 940 | 3.987095 | 3.031427 | 2.896333 | 2.949058 | 3.530902 | 3.032262 | 3.976721 | 3.740376 | 3.449969 | 3.222025 | ... | 3.132115 | 3.253186 | 3.725911 | 2.881612 | 2.712842 | 2.388651 | 3.875640 | 2.997745 | 4.002724 | 3.595566 |
| 941 | 4.264677 | 3.422473 | 3.790483 | 3.815380 | 3.824696 | 4.414338 | 4.241572 | 4.556675 | 4.671044 | 4.209612 | ... | 3.334631 | 4.043024 | 3.927909 | 4.446004 | 3.639549 | 2.755207 | 5.953902 | 3.824358 | 4.867868 | 4.932021 |
| 942 | 4.310687 | 3.611538 | 3.013154 | 3.853890 | 3.977515 | 3.920999 | 4.893990 | 4.344416 | 4.186759 | 4.697188 | ... | 3.983234 | 4.347384 | 4.795409 | 3.701891 | 3.339597 | 2.729478 | 5.232974 | 3.878088 | 4.841497 | 4.781588 |
| 943 | 3.864378 | 3.555917 | 2.590469 | 3.799089 | 3.867969 | 3.601631 | 3.272937 | 4.241627 | 2.914695 | 4.207372 | ... | 3.286477 | 3.364702 | 3.565947 | 3.699203 | 3.266083 | 2.242691 | 3.995128 | 3.168653 | 4.367111 | 3.565715 |
943 rows × 1682 columns
# Original (sparse) rating matrix, shown for comparison with nR_df above.
pivot_ratings
| item_id | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 1673 | 1674 | 1675 | 1676 | 1677 | 1678 | 1679 | 1680 | 1681 | 1682 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 1 | 5 | 3 | 4 | 3 | 3 | 5 | 4 | 1 | 5 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 4 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 939 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 940 | 0 | 0 | 0 | 2 | 0 | 0 | 4 | 5 | 3 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 941 | 5 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 942 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 943 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
943 rows × 1682 columns
Making a deep learning model for a hybrid recommendation system is the aim. We'll get a lot of help with it from embedding layers (basically a layer that maps an index to a vector of trainable weights). The Neural Network that we implemented is the following:
# Movie metadata with genre flags plus the derived keyword column.
movies_info.head()
| movie_id | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | Adventure | Animation | Children | ... | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | keyword | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ToyStory |
| 1 | 2 | GoldenEye (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?GoldenEye%20(... | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | GoldenEye |
| 2 | 3 | Four Rooms (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Four%20Rooms%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | FourRooms |
| 3 | 4 | Get Shorty (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Get%20Shorty%... | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GetShorty |
| 4 | 5 | Copycat (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Copycat%20(1995) | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Copycat |
5 rows × 25 columns
# Strip the leading '#' from each keyword so it matches movies_info.keyword.
mean_sentiments['keyword'] = mean_sentiments['keyword'].str.replace('#', '', regex=False)
mean_sentiments
| keyword | sentiment_score | |
|---|---|---|
| 0 | 'TilThereWasYou | 0.000000 |
| 1 | 1-900 | 0.057174 |
| 2 | 101Dalmatians | 0.259444 |
| 3 | 12AngryMen | 0.073759 |
| 4 | 187 | -0.048073 |
| ... | ... | ... |
| 1371 | YoungFrankenstein | 0.284343 |
| 1372 | YoungGuns | 0.284644 |
| 1373 | YoungGunsII | -0.065944 |
| 1374 | ZeusandRoxanne | 0.000000 |
| 1375 | unknown | 0.272861 |
1376 rows × 2 columns
Basically. What we need is to get a dataframe that contains all the features of the movies: Genre, Score from the users and the sentiment score that we got from the tweets. For that, we need to merge the movies_info, mean_sentiments and ratings dataframe
# Left-join the per-movie mean tweet sentiment onto the movie metadata;
# movies without matching tweets get NaN sentiment_score.
movies_info = pd.merge(movies_info, mean_sentiments, how='left', on='keyword')
movies_info
| movie_id | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | Adventure | Animation | Children | ... | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | keyword | sentiment_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ToyStory | 0.172849 |
| 1 | 2 | GoldenEye (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?GoldenEye%20(... | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | GoldenEye | 0.236044 |
| 2 | 3 | Four Rooms (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Four%20Rooms%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | FourRooms | 0.118717 |
| 3 | 4 | Get Shorty (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Get%20Shorty%... | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GetShorty | 0.077233 |
| 4 | 5 | Copycat (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Copycat%20(1995) | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Copycat | 0.144983 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1677 | 1678 | Mat' i syn (1997) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?Mat%27+i+syn+... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Mat'isyn | 0.253600 |
| 1678 | 1679 | B. Monkey (1998) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?B%2E+Monkey+(... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | B.Monkey | -0.043922 |
| 1679 | 1680 | Sliding Doors (1998) | 01-Jan-1998 | NaN | http://us.imdb.com/Title?Sliding+Doors+(1998) | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | SlidingDoors | 0.351519 |
| 1680 | 1681 | You So Crazy (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?You%20So%20Cr... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | YouSoCrazy | -0.036957 |
| 1681 | 1682 | Scream of Stone (Schrei aus Stein) (1991) | 08-Mar-1996 | NaN | http://us.imdb.com/M/title-exact?Schrei%20aus%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ScreamofStone | NaN |
1682 rows × 26 columns
# Align the ratings frame with movies_info by renaming item_id -> movie_id.
ratings = ratings.rename({'item_id': 'movie_id'}, axis='columns')
ratings
| user_id | movie_id | rating | timestamp | |
|---|---|---|---|---|
| 0 | 196 | 242 | 3 | 881250949 |
| 1 | 186 | 302 | 3 | 891717742 |
| 2 | 22 | 377 | 1 | 878887116 |
| 3 | 244 | 51 | 2 | 880606923 |
| 4 | 166 | 346 | 1 | 886397596 |
| ... | ... | ... | ... | ... |
| 99995 | 880 | 476 | 3 | 880175444 |
| 99996 | 716 | 204 | 5 | 879795543 |
| 99997 | 276 | 1090 | 1 | 874795795 |
| 99998 | 13 | 225 | 2 | 882399156 |
| 99999 | 12 | 203 | 3 | 879959583 |
100000 rows × 4 columns
# Movie metadata now including the merged sentiment_score column.
movies_info
| movie_id | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | Adventure | Animation | Children | ... | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | keyword | sentiment_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ToyStory | 0.172849 |
| 1 | 2 | GoldenEye (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?GoldenEye%20(... | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | GoldenEye | 0.236044 |
| 2 | 3 | Four Rooms (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Four%20Rooms%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | FourRooms | 0.118717 |
| 3 | 4 | Get Shorty (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Get%20Shorty%... | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | GetShorty | 0.077233 |
| 4 | 5 | Copycat (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Copycat%20(1995) | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Copycat | 0.144983 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1677 | 1678 | Mat' i syn (1997) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?Mat%27+i+syn+... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Mat'isyn | 0.253600 |
| 1678 | 1679 | B. Monkey (1998) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?B%2E+Monkey+(... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | B.Monkey | -0.043922 |
| 1679 | 1680 | Sliding Doors (1998) | 01-Jan-1998 | NaN | http://us.imdb.com/Title?Sliding+Doors+(1998) | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | SlidingDoors | 0.351519 |
| 1680 | 1681 | You So Crazy (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?You%20So%20Cr... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | YouSoCrazy | -0.036957 |
| 1681 | 1682 | Scream of Stone (Schrei aus Stein) (1991) | 08-Mar-1996 | NaN | http://us.imdb.com/M/title-exact?Schrei%20aus%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ScreamofStone | NaN |
1682 rows × 26 columns
# Movie-oriented join: attach every rating to its movie's metadata row
# (left join keeps movies even if they received no ratings).
full_df2 = pd.merge(movies_info, ratings, how='left', on='movie_id')
full_df2
| movie_id | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | Adventure | Animation | Children | ... | Romance | sci_fi | Thriller | War | Western | keyword | sentiment_score | user_id | rating | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | ToyStory | 0.172849 | 308 | 4 | 887736532 |
| 1 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | ToyStory | 0.172849 | 287 | 5 | 875334088 |
| 2 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | ToyStory | 0.172849 | 148 | 4 | 877019411 |
| 3 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | ToyStory | 0.172849 | 280 | 4 | 891700426 |
| 4 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | ToyStory | 0.172849 | 66 | 3 | 883601324 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | 1678 | Mat' i syn (1997) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?Mat%27+i+syn+... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | Mat'isyn | 0.253600 | 863 | 1 | 889289570 |
| 99996 | 1679 | B. Monkey (1998) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?B%2E+Monkey+(... | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | B.Monkey | -0.043922 | 863 | 3 | 889289491 |
| 99997 | 1680 | Sliding Doors (1998) | 01-Jan-1998 | NaN | http://us.imdb.com/Title?Sliding+Doors+(1998) | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | SlidingDoors | 0.351519 | 863 | 2 | 889289570 |
| 99998 | 1681 | You So Crazy (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?You%20So%20Cr... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | YouSoCrazy | -0.036957 | 896 | 3 | 887160722 |
| 99999 | 1682 | Scream of Stone (Schrei aus Stein) (1991) | 08-Mar-1996 | NaN | http://us.imdb.com/M/title-exact?Schrei%20aus%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | ScreamofStone | NaN | 916 | 3 | 880845755 |
100000 rows × 29 columns
# Persist the movie-oriented frame, then build the rating-oriented frame:
# one row per rating with the movie metadata attached.
full_df2.to_csv('final_df2.csv')
full_df = pd.merge(ratings, movies_info, how='left', on='movie_id')
full_df
| user_id | movie_id | rating | timestamp | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | ... | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | keyword | sentiment_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 196 | 242 | 3 | 881250949 | Kolya (1996) | 24-Jan-1997 | NaN | http://us.imdb.com/M/title-exact?Kolya%20(1996) | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Kolya | -0.015440 |
| 1 | 186 | 302 | 3 | 891717742 | L.A. Confidential (1997) | 01-Jan-1997 | NaN | http://us.imdb.com/M/title-exact?L%2EA%2E+Conf... | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | L.A.Confidential | 0.161026 |
| 2 | 22 | 377 | 1 | 878887116 | Heavyweights (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?Heavyweights%... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Heavyweights | 0.103676 |
| 3 | 244 | 51 | 2 | 880606923 | Legends of the Fall (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?Legends%20of%... | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | LegendsoftheFall | 0.226147 |
| 4 | 166 | 346 | 1 | 886397596 | Jackie Brown (1997) | 01-Jan-1997 | NaN | http://us.imdb.com/M/title-exact?imdb-title-11... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | JackieBrown | 0.227223 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | 880 | 476 | 3 | 880175444 | First Wives Club, The (1996) | 14-Sep-1996 | NaN | http://us.imdb.com/M/title-exact?First%20Wives... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | FirstWivesClub,The | 0.425364 |
| 99996 | 716 | 204 | 5 | 879795543 | Back to the Future (1985) | 01-Jan-1985 | NaN | http://us.imdb.com/M/title-exact?Back%20to%20t... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | BacktotheFuture | 0.222046 |
| 99997 | 276 | 1090 | 1 | 874795795 | Sliver (1993) | 01-Jan-1993 | NaN | http://us.imdb.com/M/title-exact?Sliver%20(1993) | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Sliver | 0.246469 |
| 99998 | 13 | 225 | 2 | 882399156 | 101 Dalmatians (1996) | 27-Nov-1996 | NaN | http://us.imdb.com/M/title-exact?101%20Dalmati... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 101Dalmatians | 0.259444 |
| 99999 | 12 | 203 | 3 | 879959583 | Unforgiven (1992) | 01-Jan-1992 | NaN | http://us.imdb.com/M/title-exact?Unforgiven%20... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Unforgiven | 0.126658 |
100000 rows × 29 columns
full_df.to_csv('final_df1.csv')
# BUG FIX: the original filled missing sentiment scores with the *string* "0",
# which silently turned the numeric sentiment_score column into dtype object.
# Fill with the number 0 instead, and assign the result back rather than using
# chained inplace fillna (a deprecated pandas pattern).
full_df['sentiment_score'] = full_df['sentiment_score'].fillna(0)
# 'keyword' is a free-text helper column not used by the models; drop it.
full_df = full_df.drop(['keyword'], axis=1)
full_df
| user_id | movie_id | rating | timestamp | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | ... | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | sentiment_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 196 | 242 | 3 | 881250949 | Kolya (1996) | 24-Jan-1997 | NaN | http://us.imdb.com/M/title-exact?Kolya%20(1996) | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -0.01544 |
| 1 | 186 | 302 | 3 | 891717742 | L.A. Confidential (1997) | 01-Jan-1997 | NaN | http://us.imdb.com/M/title-exact?L%2EA%2E+Conf... | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0.161026 |
| 2 | 22 | 377 | 1 | 878887116 | Heavyweights (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?Heavyweights%... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.103676 |
| 3 | 244 | 51 | 2 | 880606923 | Legends of the Fall (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?Legends%20of%... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0.226147 |
| 4 | 166 | 346 | 1 | 886397596 | Jackie Brown (1997) | 01-Jan-1997 | NaN | http://us.imdb.com/M/title-exact?imdb-title-11... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.227223 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | 880 | 476 | 3 | 880175444 | First Wives Club, The (1996) | 14-Sep-1996 | NaN | http://us.imdb.com/M/title-exact?First%20Wives... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.425364 |
| 99996 | 716 | 204 | 5 | 879795543 | Back to the Future (1985) | 01-Jan-1985 | NaN | http://us.imdb.com/M/title-exact?Back%20to%20t... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.222046 |
| 99997 | 276 | 1090 | 1 | 874795795 | Sliver (1993) | 01-Jan-1993 | NaN | http://us.imdb.com/M/title-exact?Sliver%20(1993) | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.246469 |
| 99998 | 13 | 225 | 2 | 882399156 | 101 Dalmatians (1996) | 27-Nov-1996 | NaN | http://us.imdb.com/M/title-exact?101%20Dalmati... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.259444 |
| 99999 | 12 | 203 | 3 | 879959583 | Unforgiven (1992) | 01-Jan-1992 | NaN | http://us.imdb.com/M/title-exact?Unforgiven%20... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.126658 |
100000 rows × 28 columns
full_df.sort_values('movie_id')
| user_id | movie_id | rating | timestamp | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | ... | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | sentiment_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25741 | 84 | 1 | 2 | 883452108 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.172849 |
| 93639 | 806 | 1 | 4 | 882385082 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.172849 |
| 55726 | 768 | 1 | 5 | 883835025 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.172849 |
| 49529 | 92 | 1 | 4 | 875810511 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.172849 |
| 89079 | 419 | 1 | 4 | 879435590 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.172849 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 75323 | 863 | 1678 | 1 | 889289570 | Mat' i syn (1997) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?Mat%27+i+syn+... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.2536 |
| 67302 | 863 | 1679 | 3 | 889289491 | B. Monkey (1998) | 06-Feb-1998 | NaN | http://us.imdb.com/M/title-exact?B%2E+Monkey+(... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | -0.043922 |
| 80394 | 863 | 1680 | 2 | 889289570 | Sliding Doors (1998) | 01-Jan-1998 | NaN | http://us.imdb.com/Title?Sliding+Doors+(1998) | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0.351519 |
| 92329 | 896 | 1681 | 3 | 887160722 | You So Crazy (1994) | 01-Jan-1994 | NaN | http://us.imdb.com/M/title-exact?You%20So%20Cr... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -0.036957 |
| 95376 | 916 | 1682 | 3 | 880845755 | Scream of Stone (Schrei aus Stein) (1991) | 08-Mar-1996 | NaN | http://us.imdb.com/M/title-exact?Schrei%20aus%... | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
100000 rows × 28 columns
After dropping all the unnecessary features from the dataframe, we obtain the following dataframe:
# Keep only the model inputs: the two ids, the rating target, the genre
# flags and the sentiment score.
full_df_clean = full_df.drop(
    columns=['timestamp', 'release_date', 'video_release_date',
             'IMDb_URL', 'movie_title'])
full_df_clean
| user_id | movie_id | rating | unknown | Action | Adventure | Animation | Children | Comedy | Crime | ... | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | sentiment_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 196 | 242 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -0.01544 |
| 1 | 186 | 302 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0.161026 |
| 2 | 22 | 377 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.103676 |
| 3 | 244 | 51 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0.226147 |
| 4 | 166 | 346 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.227223 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | 880 | 476 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.425364 |
| 99996 | 716 | 204 | 5 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.222046 |
| 99997 | 276 | 1090 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.246469 |
| 99998 | 13 | 225 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.259444 |
| 99999 | 12 | 203 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.126658 |
100000 rows × 23 columns
full_df_clean.columns
Index(['user_id', 'movie_id', 'rating', 'unknown', 'Action', 'Adventure',
'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
'Fantasy', 'Film-noir', 'Horror', 'Musical', 'Mystery', 'Romance',
'sci_fi', 'Thriller', 'War', 'Western', 'sentiment_score'],
dtype='object')
# Re-order the columns so the target ('rating') comes first, followed by the
# two id columns and then the dense features the network will consume.
column_order = ['rating', 'user_id', 'movie_id', 'sentiment_score', 'unknown',
                'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-noir',
                'Horror', 'Musical', 'Mystery', 'Romance', 'sci_fi',
                'Thriller', 'War', 'Western']
full_df_clean = full_df_clean[column_order]
full_df_clean
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 196 | 242 | -0.01544 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 3 | 186 | 302 | 0.161026 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 2 | 1 | 22 | 377 | 0.103676 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 2 | 244 | 51 | 0.226147 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| 4 | 1 | 166 | 346 | 0.227223 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | 3 | 880 | 476 | 0.425364 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 99996 | 5 | 716 | 204 | 0.222046 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 99997 | 1 | 276 | 1090 | 0.246469 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 99998 | 2 | 13 | 225 | 0.259444 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 99999 | 3 | 12 | 203 | 0.126658 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
100000 rows × 23 columns
Then we split the data into train and validation sets, using 70% for training and 30% for validation.
# 70/30 positional split. Computing the cut-off from the frame length
# (instead of hard-coding 70000) keeps the split correct if the number of
# ratings ever changes; for the 100k MovieLens ratings this is exactly 70000.
split_idx = int(len(full_df_clean) * 0.7)
train = full_df_clean.iloc[:split_idx]
validation = full_df_clean.iloc[split_idx:]
train
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 196 | 242 | -0.01544 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 3 | 186 | 302 | 0.161026 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 2 | 1 | 22 | 377 | 0.103676 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 2 | 244 | 51 | 0.226147 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| 4 | 1 | 166 | 346 | 0.227223 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 69995 | 4 | 91 | 1192 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 69996 | 2 | 788 | 174 | 0.157787 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 69997 | 4 | 62 | 134 | 0.197007 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 69998 | 3 | 699 | 762 | 0.543092 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 69999 | 2 | 561 | 379 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
70000 rows × 23 columns
validation
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 70000 | 5 | 338 | 517 | 0.177521 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 70001 | 3 | 290 | 234 | 0.165306 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 70002 | 3 | 458 | 762 | 0.543092 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 70003 | 3 | 524 | 582 | 0.39034 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 70004 | 3 | 530 | 176 | 0.158534 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99995 | 3 | 880 | 476 | 0.425364 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 99996 | 5 | 716 | 204 | 0.222046 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 99997 | 1 | 276 | 1090 | 0.246469 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 99998 | 2 | 13 | 225 | 0.259444 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 99999 | 3 | 12 | 203 | 0.126658 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
30000 rows × 23 columns
# Materialise both splits as NumPy arrays for the PyTorch training loops.
train_np, validation_np = train.to_numpy(), validation.to_numpy()
train.head()
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 196 | 242 | -0.01544 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 3 | 186 | 302 | 0.161026 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 2 | 1 | 22 | 377 | 0.103676 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 2 | 244 | 51 | 0.226147 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| 4 | 1 | 166 | 346 | 0.227223 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 23 columns
import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim

# Use the first CUDA device when one is present, otherwise fall back to CPU.
if torch.cuda.is_available():
    print("gpu up")
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)
movies_info['movie_id'].max()
1682
Building the architecture of the neural network.
class RecommendationNet(nn.Module):
    """Hybrid recommender network.

    Learns a user embedding and a movie embedding, concatenates them with the
    20 dense side features (sentiment score + 19 genre flags) and passes the
    result through a small MLP. The sigmoid output lies in (0, 1), matching
    rating targets that the training code scales by 1/10.

    Parameters
    ----------
    latent_vector_size : int
        Dimension of both the user and the movie embedding vectors.
    """

    def __init__(self, latent_vector_size):
        super(RecommendationNet, self).__init__()
        self.l_v_s = latent_vector_size
        # sparse=True means only sparse-gradient-aware optimizers
        # (e.g. Adagrad, as used below) can update these tables.
        # NOTE(review): 100000 rows looks oversized for the number of distinct
        # MovieLens-100k users — confirm, a smaller table would save memory.
        self.users = nn.Embedding(100000, self.l_v_s, sparse=True)
        # Movie ids run up to 1682, so 1683 rows covers them (row 0 unused).
        self.movies = nn.Embedding(1683, self.l_v_s, sparse=True)
        self.linear1 = nn.Linear(self.l_v_s + self.l_v_s + 20, 64)
        # FIX: renamed from the misleading 'linear2' — this layer is Dropout,
        # not a linear transform.
        self.dropout = nn.Dropout(p=0.2)
        self.linear3 = nn.Linear(64, 32)
        self.linear4 = nn.Linear(32, 1)

    def forward(self, x):
        """x: (batch, 22) tensor laid out as [user_id, movie_id, 20 features].

        Returns a (batch, 1) tensor of predictions in (0, 1).
        """
        user = x[:, 0].long()     # user ids in column 0
        movie = x[:, 1].long()    # movie ids in column 1
        otherfeatures = x[:, 2:]  # sentiment score + genre flags
        userVector = self.users(user)
        movieVector = self.movies(movie)
        # Concatenate both embeddings with the dense side features.
        layer1 = torch.cat((userVector, movieVector, otherfeatures), 1)
        layer2 = F.relu(self.linear1(layer1))
        layer3 = F.relu(self.dropout(layer2))
        layer4 = F.relu(self.linear3(layer3))
        # Sigmoid keeps the prediction in (0, 1); targets are ratings / 10.
        out = torch.sigmoid(self.linear4(layer4))
        return out
# Build the model with a 90-dimensional latent space and move its parameters
# to the selected device (the trailing expression also echoes the
# architecture in the notebook output).
myNN = RecommendationNet(90)
myNN.to(device)
RecommendationNet( (users): Embedding(100000, 90, sparse=True) (movies): Embedding(1683, 90, sparse=True) (linear1): Linear(in_features=200, out_features=64, bias=True) (linear2): Dropout(p=0.2, inplace=False) (linear3): Linear(in_features=64, out_features=32, bias=True) (linear4): Linear(in_features=32, out_features=1, bias=True) )
There are some hyperparameters to tune to get a better model, such as the latent vector size and the number of epochs. To find good values, we iterated over different latent vector sizes and compared the validation errors.
# Hyper-parameter sweep over the latent vector size.
#
# BUG FIX: the original created ONE optimizer (bound to the previously
# instantiated `myNN`) before the loop and then re-created `myNN` inside the
# loop. optimizer.step() therefore updated the parameters of the old,
# discarded model — none of the swept models were actually being trained.
# The optimizer must be rebuilt for each new model.
train_np = train.to_numpy()
batch_size = 64
npData = train_np
# Scale the rating target into (0, 1) to match the sigmoid output.
npData[:, :1] = npData[:, :1] / 10

def ceil(a, b):
    """Integer ceiling division."""
    return -(-a // b)

n_samples = len(npData)
# Even out the batch sizes so the final batch is not a tiny remainder.
better_batch_size = ceil(n_samples, ceil(n_samples, batch_size))
mses_v = []
vector_size = []
for j in range(20, 130, 10):
    myNN = RecommendationNet(j)
    myNN.to(device)
    # Bind the optimizer to THIS model's parameters (see bug-fix note above).
    optimizer = optim.Adagrad(myNN.parameters(), lr=0.001)
    myNN.train()
    # FIX: the outer loop variable no longer shadows the batch index `i`.
    for epoch in range(20):
        for i in range(ceil(n_samples, better_batch_size)):
            batch = npData[i * better_batch_size: (i + 1) * better_batch_size]
            batch = torch.Tensor(batch.astype(np.float32)).to(device)
            X = batch[:, 1:]
            y = batch[:, :1]
            myNN.zero_grad()
            pred = myNN(X)
            err = F.mse_loss(pred, y)
            err.backward()
            optimizer.step()
    # Validate with dropout disabled and autograd off.
    myNN.eval()
    with torch.no_grad():
        valpreds = myNN(torch.Tensor(validation_np.astype(np.float32)[:, 1:]).to(device)).cpu().numpy().reshape(-1)
    # NOTE(review): np.sqrt(MSE) is the RMSE, although it is stored as 'mse'.
    mse = np.sqrt(mean_squared_error(validation.rating.to_numpy(), valpreds * 10))
    mses_v.append(mse)
    vector_size.append(j)
    print(f'Vector Size: {j}, MSE: {mse}')
Vector Size: 20, MSE: 1.5230261396833948 Vector Size: 30, MSE: 1.6319225032190623 Vector Size: 40, MSE: 2.097081441736489 Vector Size: 50, MSE: 1.9112220946191996 Vector Size: 60, MSE: 1.8562537275341207 Vector Size: 70, MSE: 2.198038910830882 Vector Size: 80, MSE: 2.0505509564020485 Vector Size: 90, MSE: 2.3429269976421656 Vector Size: 100, MSE: 1.6457462881067895 Vector Size: 110, MSE: 2.1315204965099217 Vector Size: 120, MSE: 1.9890552462278643
After the experiment, we selected a vector size of 100. (Note for review: in this run the lowest validation error was actually achieved at vector size 20, so the choice of 100 should be justified or revisited.)
import seaborn as sns

# Validation error as a function of the swept latent-vector size.
results_df_NN = pd.DataFrame({'Vector_size': vector_size, 'mses': mses_v})
sns.lineplot(data=results_df_NN, x="Vector_size", y="mses")
<AxesSubplot:xlabel='Vector_size', ylabel='mses'>
Now, the same process for the epochs.
# Epoch sweep at a fixed latent size (70).
#
# BUG FIX: as in the vector-size sweep, the optimizer was created BEFORE the
# fresh `myNN = RecommendationNet(70)` and therefore updated the parameters
# of the previous model. The evaluated model never learned, which is why the
# recorded validation error stayed flat (~1.663) across all 300 epochs.
# Create the model first, then the optimizer over its parameters.
train_np = train.to_numpy()
batch_size = 64
npData = train_np
npData[:, :1] = npData[:, :1] / 10  # scale ratings into (0, 1)

def ceil(a, b):
    """Integer ceiling division."""
    return -(-a // b)

n_samples = len(npData)
better_batch_size = ceil(n_samples, ceil(n_samples, batch_size))
print(better_batch_size)
mses = []
epochs = []
myNN = RecommendationNet(70)
myNN.to(device)
optimizer = optim.Adagrad(myNN.parameters(), lr=0.001)
for epoch in range(300):
    myNN.train()
    for i in range(ceil(n_samples, better_batch_size)):
        batch = npData[i * better_batch_size: (i + 1) * better_batch_size]
        batch = torch.Tensor(batch.astype(np.float32)).to(device)
        X = batch[:, 1:]
        y = batch[:, :1]
        myNN.zero_grad()
        pred = myNN(X)
        err = F.mse_loss(pred, y)
        err.backward()
        optimizer.step()
    # Validation RMSE with dropout off and gradients disabled.
    myNN.eval()
    with torch.no_grad():
        valpreds = myNN(torch.Tensor(validation_np.astype(np.float32)[:, 1:]).to(device)).cpu().numpy().reshape(-1)
    mse = np.sqrt(mean_squared_error(validation.rating.to_numpy(), valpreds * 10))
    mses.append(mse)
    epochs.append(epoch)
    if epoch % 10 == 0:
        print(f'MSE: {mse}, EPOCH: {epoch}')
64 MSE: 1.6634592832720538, EPOCH: 0 MSE: 1.6626294981570278, EPOCH: 10 MSE: 1.6627875939178831, EPOCH: 20 MSE: 1.66395219337277, EPOCH: 30 MSE: 1.6640860042539463, EPOCH: 40 MSE: 1.663054206524036, EPOCH: 50 MSE: 1.6632302496780746, EPOCH: 60 MSE: 1.6638191025710554, EPOCH: 70 MSE: 1.6638931140247575, EPOCH: 80 MSE: 1.6639804745716662, EPOCH: 90 MSE: 1.6630289764280122, EPOCH: 100 MSE: 1.6635090414600375, EPOCH: 110 MSE: 1.6638077995010396, EPOCH: 120 MSE: 1.6640177723144447, EPOCH: 130 MSE: 1.6639721361811273, EPOCH: 140 MSE: 1.663653751553435, EPOCH: 150 MSE: 1.663679073564936, EPOCH: 160 MSE: 1.6629316395259386, EPOCH: 170 MSE: 1.6643269908146885, EPOCH: 180 MSE: 1.663292876964108, EPOCH: 190 MSE: 1.6640763362133, EPOCH: 200 MSE: 1.6636144589682778, EPOCH: 210 MSE: 1.6633286873267812, EPOCH: 220 MSE: 1.6635292668952633, EPOCH: 230 MSE: 1.6638581493448947, EPOCH: 240 MSE: 1.6625040715552581, EPOCH: 250 MSE: 1.6637632442986203, EPOCH: 260 MSE: 1.6637370226328796, EPOCH: 270 MSE: 1.663694913848375, EPOCH: 280 MSE: 1.6645676858041543, EPOCH: 290
# Learning curve: validation error per training epoch.
results_df = pd.DataFrame({'epochs': epochs, 'mses': mses})
sns.lineplot(data=results_df, x="epochs", y="mses")
<AxesSubplot:xlabel='epochs', ylabel='mses'>
# Matrix-factorization sweep results. NOTE(review): `ks` and `errors` are
# produced by the MF section earlier in the notebook (not visible in this
# chunk) — presumably the swept factor sizes and their errors; confirm.
results_df = pd.DataFrame(list(zip(ks, errors)),
columns =['Vector_size', 'mses'])
sns.lineplot(data=results_df, x="Vector_size", y="mses")
<AxesSubplot:xlabel='Vector_size', ylabel='mses'>
# Re-plot the neural network's vector-size sweep for side-by-side comparison.
results_df_NN = pd.DataFrame({'Vector_size': vector_size, 'mses': mses_v})
sns.lineplot(data=results_df_NN, x="Vector_size", y="mses")
<AxesSubplot:xlabel='Vector_size', ylabel='mses'>
# Overlay both sweeps on one axis: MF results in red, hybrid-NN results in
# the default palette color.
sns.lineplot(data=results_df, x='Vector_size', y='mses', color="red")
sns.lineplot(data=results_df_NN, x='Vector_size', y='mses')
<AxesSubplot:xlabel='Vector_size', ylabel='mses'>
First, we are going to extract the top 10 and top 20 movies from the real dataset for user_id 1. With that, we can compare the results of both recommender systems and calculate some evaluation metrics to display the performance of each system.
# Ground truth for user 1: the 20 movies they rated highest in the
# validation split, plus numpy views reused later for the evaluation metrics.
user_rows = validation[validation.user_id == 1]
real_top_20 = user_rows.sort_values(by=['rating'], ascending=False).head(20)
movies_id_top_20 = list(real_top_20['movie_id'])
real_top_20_np = real_top_20['rating'].to_numpy()
real_top_20_np_movies = real_top_20['movie_id'].to_numpy()
real_top_10 = real_top_20.head(10)
real_top_10_movies = real_top_20_np_movies[:10]
real_top_10_np = real_top_10['rating'].to_numpy()
real_top_20
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 82786 | 5 | 1 | 204 | 0.222046 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 96699 | 5 | 1 | 152 | 0.213083 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 88021 | 5 | 1 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 74577 | 5 | 1 | 165 | 0.4843 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 85557 | 5 | 1 | 19 | 0.30574 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 75385 | 5 | 1 | 198 | 0.215182 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 77073 | 5 | 1 | 124 | 0.213847 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 88259 | 5 | 1 | 111 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 92487 | 5 | 1 | 172 | 0.150176 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
| 84793 | 5 | 1 | 207 | 0.27717 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 87967 | 5 | 1 | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 78817 | 5 | 1 | 216 | 0.313698 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 89876 | 5 | 1 | 13 | 0.6249 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 83307 | 4 | 1 | 3 | 0.118717 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 70539 | 4 | 1 | 7 | 0.146624 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 92049 | 4 | 1 | 28 | 0.130862 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 88262 | 4 | 1 | 52 | 0.347813 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 88893 | 4 | 1 | 88 | 0.148203 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 78171 | 4 | 1 | 58 | 0.271529 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 77238 | 4 | 1 | 95 | 0.234822 | 0 | 0 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
20 rows × 23 columns
The predicted ratings for the top 20 movies of user_id 1, using the hybrid recommender system, are the following:
# Predict ratings for user 1's real top-20 movies with the hybrid network.
# FIX: run the network in eval() mode under no_grad() — the original kept
# dropout active at inference time, adding random noise to the predictions.
npRecommend = real_top_20.to_numpy()
myNN.eval()
with torch.no_grad():
    npRecommend[:, 0] = myNN(torch.Tensor(npRecommend.astype(np.float32)[:, 1:]).to(device)).cpu().numpy().reshape(-1)
recomend_NN_df = pd.DataFrame(npRecommend, columns=['rating', 'user_id', 'movie_id', 'sentiment_score', 'unknown', 'Action', 'Adventure',
    'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
    'Fantasy', 'Film-noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'sci_fi', 'Thriller', 'War', 'Western'])
# Undo the /10 target scaling: x10 maps the sigmoid output back onto the
# rating scale (range (0, 10)).
recomend_NN_df['rating'] = recomend_NN_df['rating'] * 10
recomend_NN_top_20 = recomend_NN_df.sort_values(by=['rating'], ascending=False)
recomend_NN_top_20_np = recomend_NN_top_20['rating'].to_numpy()
recomend_NN_top_10 = recomend_NN_top_20[:10]
recomend_NN_top_10_np = recomend_NN_top_10['rating'].to_numpy()
recomend_NN_top_20
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 5.136009 | 1 | 216 | 0.313698 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 16 | 5.110055 | 1 | 52 | 0.347813 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 4.943981 | 1 | 204 | 0.222046 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 13 | 4.931236 | 1 | 3 | 0.118717 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 15 | 4.904124 | 1 | 28 | 0.130862 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 5 | 4.869394 | 1 | 198 | 0.215182 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 9 | 4.851279 | 1 | 207 | 0.27717 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 14 | 4.842371 | 1 | 7 | 0.146624 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 6 | 4.818701 | 1 | 124 | 0.213847 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4.783924 | 1 | 165 | 0.4843 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 8 | 4.782398 | 1 | 172 | 0.150176 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
| 10 | 4.77121 | 1 | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12 | 4.734166 | 1 | 13 | 0.6249 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 4.722641 | 1 | 19 | 0.30574 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 4.706887 | 1 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 17 | 4.665704 | 1 | 88 | 0.148203 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7 | 4.614007 | 1 | 111 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 4.601499 | 1 | 152 | 0.213083 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 19 | 4.587865 | 1 | 95 | 0.234822 | 0 | 0 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | 4.481873 | 1 | 58 | 0.271529 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20 rows × 23 columns
The predicted ratings for the same top 20 movies (the first user row of the reconstructed rating matrix), using the Matrix Factorization recommender system, are the following:
# Matrix-factorization predictions for the first user of nR_df.
# NOTE(review): nR_df is built in the MF section earlier in the notebook —
# presumably the reconstructed users x movies rating matrix; confirm layout.
recomend_MF = nR_df.iloc[[0]].T
recomend_MF['movie_id'] = recomend_MF.index
recomend_MF = recomend_MF.reset_index(level=0)
# Keep only the movies present in the user's real top-20.
# NOTE(review): these rows remain in movie_id order, NOT predicted-rating
# order, while real_top_20_np is sorted by true rating — the element-wise
# MSE/RMSE computed later therefore compares misaligned pairs; consider
# aligning both arrays on movie_id before computing the metrics.
recomend_MF_top_20 = recomend_MF[recomend_MF['movie_id'].isin(movies_id_top_20)]
recomend_MF_top_10 = recomend_MF_top_20[:10]
recomend_MF_top_20_np = recomend_MF_top_20[1].to_numpy()
recomend_MF_top_10_np = recomend_MF_top_10[1].to_numpy()
recomend_MF_top_20
| index | 1 | movie_id | |
|---|---|---|---|
| 2 | 3 | 3.685195 | 3 |
| 6 | 7 | 3.916772 | 7 |
| 12 | 13 | 4.008592 | 13 |
| 14 | 15 | 3.591299 | 15 |
| 18 | 19 | 4.500861 | 19 |
| 27 | 28 | 4.201444 | 28 |
| 51 | 52 | 4.238310 | 52 |
| 57 | 58 | 3.928805 | 58 |
| 58 | 59 | 4.339364 | 59 |
| 87 | 88 | 3.298824 | 88 |
| 94 | 95 | 3.619747 | 95 |
| 110 | 111 | 3.647951 | 111 |
| 123 | 124 | 4.287549 | 124 |
| 151 | 152 | 3.915166 | 152 |
| 164 | 165 | 4.195137 | 165 |
| 171 | 172 | 4.531975 | 172 |
| 197 | 198 | 4.012992 | 198 |
| 203 | 204 | 3.686991 | 204 |
| 206 | 207 | 3.759166 | 207 |
| 215 | 216 | 3.896622 | 216 |
real_top_20_np
array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4],
dtype=int64)
recomend_NN_top_20_np
array([5.136008858680725, 5.110055208206177, 4.943981170654297,
4.931235909461975, 4.904123842716217, 4.869393706321716,
4.851279258728027, 4.842371046543121, 4.818700551986694,
4.783923625946045, 4.782398343086243, 4.771209955215454,
4.734165966510773, 4.722641408443451, 4.70688670873642,
4.665704369544983, 4.614007472991943, 4.601499438285828,
4.587864875793457, 4.481872618198395], dtype=object)
Now, we can calculate some evaluation metrics such as RMSE,Precision, Recall, and F1-Score
Mean Squared Error (MSE) is the mean of the squared differences between the true and predicted values of the dataset; it measures the variance of the residuals.
Root Mean Squared Error (RMSE) is the square root of the MSE; it measures the standard deviation of the residuals and is expressed in the same units as the ratings.
# MSE / RMSE between the true top-k ratings and each recommender's predictions.
# NOTE(review): the prediction arrays are sorted by *predicted* score (NN) or
# left in movie_id order (MF), while real_top_*_np is sorted by *true* rating,
# so these element-wise errors do not compare the same movie at each position.
# Treat the absolute numbers with caution; aligning on movie_id first would
# make the comparison sound.
mse_top_20_MF = mean_squared_error(real_top_20_np, recomend_MF_top_20_np)
mse_top_10_MF = mean_squared_error(real_top_10_np, recomend_MF_top_10_np)
mse_top_20_NN = mean_squared_error(real_top_20_np, recomend_NN_top_20_np)
mse_top_10_NN = mean_squared_error(real_top_10_np, recomend_NN_top_10_np)
rmse_top_20_MF = np.sqrt(mean_squared_error(real_top_20_np, recomend_MF_top_20_np))
rmse_top_10_MF = np.sqrt(mean_squared_error(real_top_10_np, recomend_MF_top_10_np))
rmse_top_20_NN = np.sqrt(mean_squared_error(real_top_20_np, recomend_NN_top_20_np))
rmse_top_10_NN = np.sqrt(mean_squared_error(real_top_10_np, recomend_NN_top_10_np))
# Collect all eight metrics into one frame for plotting.
names = ['mse top 20 MF','mse top 10 MF','mse top 20 NN','mse top 10 NN','RMSE top 20 MF','RMSE top 10 MF','RMSE top 20 NN','RMSE top 10 NN']
rmse_df = pd.DataFrame([mse_top_20_MF,mse_top_10_MF,mse_top_20_NN,mse_top_10_NN,rmse_top_20_MF,rmse_top_10_MF,rmse_top_20_NN,rmse_top_10_NN],columns = ['rmse'])
rmse_df['names'] = names
print(f'MSE top 20 Matrix Factorization: {mse_top_20_MF}')
print(f'MSE top 10 Matrix Factorization: {mse_top_10_MF}')
print(f'MSE top 20 Hybrid Recomender: {mse_top_20_NN}')
print(f'MSE top 10 Hybrid Recomender: {mse_top_10_NN}')
print(f'RMSE top 20 Matrix Factorization: {rmse_top_20_MF}')
print(f'RMSE top 10 Matrix Factorization: {rmse_top_10_MF}')
print(f'RMSE top 20 Hybrid Recomender: {rmse_top_20_NN}')
print(f'RMSE top 10 Hybrid Recomender: {rmse_top_10_NN}')
MSE top 20 Matrix Factorization: 0.8275074080138138 MSE top 10 Matrix Factorization: 1.181432364808971 MSE top 20 Hybrid Recomender: 0.15716342794316454 MSE top 10 Hybrid Recomender: 0.019125064449223572 RMSE top 20 Matrix Factorization: 0.9096743417365436 RMSE top 10 Matrix Factorization: 1.0869371485090438 RMSE top 20 Hybrid Recomender: 0.39643842894346726 RMSE top 10 Hybrid Recomender: 0.1382933998758566
import plotly.express as px

# Horizontal bar chart comparing the MSE/RMSE of both recommenders.
fig = px.bar(rmse_df, y='names', x='rmse')
fig.show()
We can define precision as the ratio tp/(tp+fp), where tp is the number of true positives and fp the number of false positives. Intuitively, the classifier's precision is its capacity not to classify a negative sample as positive.
1 is the best value, while 0 is the worst.
# Score every candidate movie for user 2 with the hybrid (NN) recommender.
npRecommend = full_df_clean[full_df_clean['user_id']==2].to_numpy()
# Feed the feature columns (everything except the rating in column 0) through
# the network and overwrite column 0 with the predicted ratings.
# FIX: the original chained a redundant second .to(device) on the model
# output — the output already lives on the device, so it is dropped here.
# detach() before cpu() to cut the autograd graph first.
npRecommend[:,0] = myNN(torch.Tensor(npRecommend.astype(np.float32)[:,1:]).to(device)).detach().cpu().numpy().reshape(-1)
recomend_NN_df = pd.DataFrame(npRecommend,columns =['rating', 'user_id', 'movie_id', 'sentiment_score', 'unknown', 'Action', 'Adventure',
'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
'Fantasy', 'Film-noir', 'Horror', 'Musical', 'Mystery', 'Romance',
'sci_fi', 'Thriller', 'War', 'Western'])
# Rescale predictions to the star-rating range (presumably the network was
# trained on ratings divided by 10 — TODO confirm against the training cell).
recomend_NN_df['rating']= recomend_NN_df['rating']*10
# Keep the 20 (and 10) highest-scored movies and their movie ids.
recomend_NN_top_20 = recomend_NN_df.sort_values(by=['rating'], ascending=False)[:20]
recomend_NN_top_20_np = recomend_NN_top_20['movie_id'].to_numpy()
recomend_NN_top_10 = recomend_NN_top_20[:10]
recomend_NN_top_10_np = recomend_NN_top_10['movie_id'].to_numpy()
recomend_NN_top_20
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 59 | 5.14832 | 2 | 275 | 0.358261 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 34 | 5.092069 | 2 | 291 | 0.158954 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 23 | 5.090956 | 2 | 282 | -0.1048 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 50 | 5.09086 | 2 | 100 | 0.098552 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 9 | 5.039878 | 2 | 280 | 0.319471 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 53 | 5.024852 | 2 | 289 | 0.089923 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 5.01139 | 2 | 251 | 0.24655 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 4.99816 | 2 | 292 | 0.188411 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 4.994484 | 2 | 290 | 0.0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 56 | 4.988626 | 2 | 278 | 0.24541 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 52 | 4.978828 | 2 | 285 | -0.19374 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | 4.978101 | 2 | 315 | 0.030111 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 19 | 4.975234 | 2 | 299 | 0.140425 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 8 | 4.971454 | 2 | 13 | 0.6249 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 58 | 4.949393 | 2 | 286 | 0.2232 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 51 | 4.947826 | 2 | 127 | 0.246348 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 55 | 4.939068 | 2 | 272 | 0.243833 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 11 | 4.93773 | 2 | 308 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 27 | 4.927318 | 2 | 242 | -0.01544 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 25 | 4.922208 | 2 | 258 | 0.188986 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
20 rows × 23 columns
# Build the Matrix-Factorization recommendations for the first user: take
# row 0 of the reconstructed rating matrix and transpose it so that each
# movie becomes a row.
recomend_MF = nR_df.iloc[[0]].T
recomend_MF['movie_id'] = recomend_MF.index
recomend_MF = recomend_MF.reset_index(level=0)
# Rank movies by predicted rating (column label 1 — the transposed row's
# index label) and keep the top 20 / top 10 with their movie ids.
recomend_MF_top_20 = recomend_MF.sort_values(by=[1], ascending=False).head(20)
recomend_MF_top_20_np = recomend_MF_top_20['movie_id'].to_numpy()
recomend_MF_top_10 = recomend_MF_top_20.head(10)
recomend_MF_top_10_np = recomend_MF_top_10['movie_id'].to_numpy()
recomend_MF_top_20
| index | 1 | movie_id | |
|---|---|---|---|
| 1678 | 1679 | 5.585002 | 1679 |
| 1448 | 1449 | 5.305205 | 1449 |
| 1079 | 1080 | 5.157463 | 1080 |
| 356 | 357 | 5.077536 | 357 |
| 118 | 119 | 5.069472 | 119 |
| 656 | 657 | 5.013197 | 657 |
| 1573 | 1574 | 5.011453 | 1574 |
| 512 | 513 | 4.992326 | 513 |
| 126 | 127 | 4.985257 | 127 |
| 1599 | 1600 | 4.971173 | 1600 |
| 1497 | 1498 | 4.965852 | 1498 |
| 1466 | 1467 | 4.961076 | 1467 |
| 1537 | 1538 | 4.956167 | 1538 |
| 473 | 474 | 4.948204 | 474 |
| 479 | 480 | 4.931411 | 480 |
| 1628 | 1629 | 4.892277 | 1629 |
| 63 | 64 | 4.886407 | 64 |
| 427 | 428 | 4.873402 | 428 |
| 442 | 443 | 4.868379 | 443 |
| 168 | 169 | 4.867561 | 169 |
# Truncate the MF predicted ratings to whole stars so they are comparable
# with the integer ground-truth ratings. astype(int) truncates toward zero
# exactly like the original per-element apply(lambda x: int(x)), but in a
# single vectorized call.
recomend_MF_top_20[1] = recomend_MF_top_20[1].astype(int)
recomend_MF_top_10 = recomend_MF_top_20[:10]
# NOTE(review): from here on the *_np arrays hold predicted ratings, not
# movie ids — they deliberately overwrite the id arrays built earlier so the
# metrics below compare rating vectors.
recomend_MF_top_20_np = recomend_MF_top_20[1].to_numpy()
recomend_MF_top_10_np = recomend_MF_top_10[1].to_numpy()
recomend_MF_top_20
| index | 1 | movie_id | |
|---|---|---|---|
| 1678 | 1679 | 5 | 1679 |
| 1448 | 1449 | 5 | 1449 |
| 1079 | 1080 | 5 | 1080 |
| 356 | 357 | 5 | 357 |
| 118 | 119 | 5 | 119 |
| 656 | 657 | 5 | 657 |
| 1573 | 1574 | 5 | 1574 |
| 512 | 513 | 4 | 513 |
| 126 | 127 | 4 | 127 |
| 1599 | 1600 | 4 | 1600 |
| 1497 | 1498 | 4 | 1498 |
| 1466 | 1467 | 4 | 1467 |
| 1537 | 1538 | 4 | 1538 |
| 473 | 474 | 4 | 474 |
| 479 | 480 | 4 | 480 |
| 1628 | 1629 | 4 | 1629 |
| 63 | 64 | 4 | 64 |
| 427 | 428 | 4 | 428 |
| 442 | 443 | 4 | 443 |
| 168 | 169 | 4 | 169 |
# Truncate the NN predicted ratings to whole stars, mirroring the MF side.
# astype(int) truncates toward zero exactly like the original per-element
# apply(lambda x: int(x)), but vectorized.
recomend_NN_top_20['rating'] = recomend_NN_top_20['rating'].astype(int)
recomend_NN_top_10 = recomend_NN_top_20[:10]
# NOTE(review): these *_np arrays now hold ratings, replacing the movie-id
# arrays built earlier — the downstream metrics compare rating vectors.
recomend_NN_top_20_np = recomend_NN_top_20['rating'].to_numpy()
recomend_NN_top_10_np = recomend_NN_top_10['rating'].to_numpy()
recomend_NN_top_20
| rating | user_id | movie_id | sentiment_score | unknown | Action | Adventure | Animation | Children | Comedy | ... | Fantasy | Film-noir | Horror | Musical | Mystery | Romance | sci_fi | Thriller | War | Western | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 59 | 5 | 2 | 275 | 0.358261 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 34 | 5 | 2 | 291 | 0.158954 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 23 | 5 | 2 | 282 | -0.1048 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 50 | 5 | 2 | 100 | 0.098552 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 9 | 5 | 2 | 280 | 0.319471 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 53 | 5 | 2 | 289 | 0.089923 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 5 | 2 | 251 | 0.24655 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 4 | 2 | 292 | 0.188411 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 4 | 2 | 290 | 0.0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 56 | 4 | 2 | 278 | 0.24541 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 52 | 4 | 2 | 285 | -0.19374 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | 4 | 2 | 315 | 0.030111 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 19 | 4 | 2 | 299 | 0.140425 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 8 | 4 | 2 | 13 | 0.6249 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 58 | 4 | 2 | 286 | 0.2232 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 51 | 4 | 2 | 127 | 0.246348 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 55 | 4 | 2 | 272 | 0.243833 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 11 | 4 | 2 | 308 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 27 | 4 | 2 | 242 | -0.01544 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 25 | 4 | 2 | 258 | 0.188986 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
20 rows × 23 columns
# Ground-truth ratings of the user's actual top-20 movies; the top-10 vector
# is just the first half of the same array.
real_top_20_np_movies = real_top_20['rating'].to_numpy()
real_top_10_np_movies = real_top_20_np_movies[0:10]
real_top_20_np_movies
array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4],
dtype=int64)
Using the 'micro' parameter, the function will calculate metrics globally by counting the total true positives, false negatives and false positives. When true positive + false positive == 0, the function will return 0 and raise UndefinedMetricWarning. This behavior can be modified with zero_division.
from sklearn.metrics import precision_score

# Micro-averaged precision of each recommender's truncated top-k ratings
# against the user's real top-k ratings.
precision_MF_top_20 = precision_score(real_top_20_np_movies, recomend_MF_top_20_np, average='micro')
precision_NN_top_20 = precision_score(real_top_20_np_movies, recomend_NN_top_20_np, average='micro')
precision_MF_top_10 = precision_score(real_top_10_np_movies, recomend_MF_top_10_np, average='micro')
precision_NN_top_10 = precision_score(real_top_10_np_movies, recomend_NN_top_10_np, average='micro')
names = ['precision_MF_top_20','precision_NN_top_20','precision_MF_top_10','precision_NN_top_10']
precision = [precision_MF_top_20, precision_NN_top_20, precision_MF_top_10, precision_NN_top_10]
# Assemble the plotting frame in one go (columns: 'precision', then 'names').
precision_df = pd.DataFrame({'precision': precision, 'names': names})
# Each label equals its variable name, so a zip loop prints identical lines.
for label, value in zip(names, precision):
    print(f'{label}: {value}')
precision_MF_top_20: 0.7 precision_NN_top_20: 0.7 precision_MF_top_10: 0.7 precision_NN_top_10: 0.7
# Bar chart of the four precision scores.
fig = px.bar(precision_df, y='names', x='precision')
fig.show()
The classifier's capacity to locate all the positive samples is known as recall. We can define recall as the ratio tp / (tp + fn), where fn is the number of false negatives and tp the number of true positives.
1 is the best value, while 0 is the worst.
from sklearn.metrics import recall_score

# Micro-averaged recall of each recommender's truncated top-k ratings versus
# the real top-k ratings; zero_division=1 silences the undefined-metric case.
recall_MF_top_20 = recall_score(real_top_20_np_movies, recomend_MF_top_20_np, average='micro', zero_division=1)
recall_NN_top_20 = recall_score(real_top_20_np_movies, recomend_NN_top_20_np, average='micro', zero_division=1)
recall_MF_top_10 = recall_score(real_top_10_np_movies, recomend_MF_top_10_np, average='micro', zero_division=1)
recall_NN_top_10 = recall_score(real_top_10_np_movies, recomend_NN_top_10_np, average='micro', zero_division=1)
names = ['recall_MF_top_20','recall_NN_top_20','recall_MF_top_10','recall_NN_top_10']
recall = [recall_MF_top_20, recall_NN_top_20, recall_MF_top_10, recall_NN_top_10]
# Assemble the plotting frame in one go (columns: 'recall', then 'names').
recall_df = pd.DataFrame({'recall': recall, 'names': names})
# Each label equals its variable name, so a zip loop prints identical lines.
for label, value in zip(names, recall):
    print(f'{label}: {value}')
recall_MF_top_20: 0.7 recall_NN_top_20: 0.7 recall_MF_top_10: 0.7 recall_NN_top_10: 0.7
# Bar chart of the four recall scores.
fig = px.bar(recall_df, y='names', x='recall')
fig.show()
The F1 score can be thought of as a harmonic mean of precision and recall, with the best value being 1 and the poorest being 0. Precision and recall both contribute equally in terms of percentage to the F1 score. We can calculate the F1 score with the formula: F1 = 2 * (precision * recall) / (precision + recall)
from sklearn.metrics import f1_score

# Micro-averaged F1 of each recommender's truncated top-k ratings versus the
# real top-k ratings; zero_division=1 silences the undefined-metric case.
f1_MF_top_20 = f1_score(real_top_20_np_movies, recomend_MF_top_20_np, average='micro', zero_division=1)
f1_NN_top_20 = f1_score(real_top_20_np_movies, recomend_NN_top_20_np, average='micro', zero_division=1)
f1_MF_top_10 = f1_score(real_top_10_np_movies, recomend_MF_top_10_np, average='micro', zero_division=1)
f1_NN_top_10 = f1_score(real_top_10_np_movies, recomend_NN_top_10_np, average='micro', zero_division=1)
names = ['f1_MF_top_20','f1_NN_top_20','f1_MF_top_10','f1_NN_top_10']
f1 = [f1_MF_top_20, f1_NN_top_20, f1_MF_top_10, f1_NN_top_10]
# Assemble the plotting frame in one go (columns: 'f1', then 'names').
f1_df = pd.DataFrame({'f1': f1, 'names': names})
# Each label equals its variable name, so a zip loop prints identical lines.
for label, value in zip(names, f1):
    print(f'{label}: {value}')
f1_MF_top_20: 0.7 f1_NN_top_20: 0.7 f1_MF_top_10: 0.7 f1_NN_top_10: 0.7
# Bar chart of the four F1 scores.
fig = px.bar(f1_df, y='names', x='f1')
fig.show()